[llvm] dda2cd2 - [AArch64][SVE2] Change the cost of extends with S/URHADD to 0

Mon Aug 14 03:33:07 PDT 2023

Author: Kerry McLaughlin
Date: 2023-08-14T10:32:06Z
New Revision: dda2cd2505301aa626fcd3e8dea2a447227d00ca

URL: https://github.com/llvm/llvm-project/commit/dda2cd2505301aa626fcd3e8dea2a447227d00ca
DIFF: https://github.com/llvm/llvm-project/commit/dda2cd2505301aa626fcd3e8dea2a447227d00ca.diff

LOG: [AArch64][SVE2] Change the cost of extends with S/URHADD to 0

When SVE2 is enabled, we can combine an add of 1, an add and a shift right by 1
into a single s/urhadd instruction. If the operands to the adds are extended,
these extends will fold into the s/urhadd and their costs should be 0.

Reviewed By: dtemirbulatov

Differential Revision: https://reviews.llvm.org/D157628

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd-costs.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8bc9a0a1b78a64..7310e95220d622 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2044,6 +2044,72 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
 }
 
+// Returns true if the extend \p Ext (widening \p Src to \p Dst) feeds, through
+// its single user \p I, the rounding-halving-add idiom that SVE2 can select as
+// a single s/urhadd instruction; such an extend is folded away and is 'free':
+//    %ld1 = load i8, ptr %a
+//    %zext1 = zext i8 %ld1 to i16
+//    %ld2 = load i8, ptr %b
+//    %zext2 = zext i8 %ld2 to i16
+//    %add1 = add nuw nsw i16 %zext1, 1
+//    %add2 = add nuw nsw i16 %add1, %zext2
+//    %shr = lshr i16 %add2, 1
+//    %trunc = trunc i16 %shr to i8
+// \p I may be either add; the adds and the shift must each have one user.
+bool isExtShiftRightAdd(const Instruction *I, const Instruction *Ext, Type *Dst,
+                        Type *Src) {
+  // Check that the cast is doubling the source type.
+  if ((Src->getScalarSizeInBits() != Dst->getScalarSizeInBits() / 2) ||
+      I->getOpcode() != Instruction::Add || !I->hasOneUser())
+    return false;
+
+  // Check for the add/shift/trunc pattern if I is an add of a constant.
+  auto *Op1 = dyn_cast<ConstantInt>(I->getOperand(1));
+  if (!Op1) {
+    // Otherwise, get the other operand and look for the same pattern
+    // if this is an add.
+    auto *Op = I->getOperand(0) == Ext ? I->getOperand(1) : I->getOperand(0);
+
+    I = dyn_cast<Instruction>(Op);
+    if (!I || I->getOpcode() != Instruction::Add || !I->hasOneUser())
+      return false;
+
+    Op1 = dyn_cast<ConstantInt>(I->getOperand(1));
+  }
+
+  // isOne() compares the full-width value, so unlike getZExtValue() it
+  // cannot assert on constants wider than 64 bits.
+  if (!Op1 || !Op1->isOne())
+    return false;
+
+  // The add of 1 should only have one user (checked above), which must be
+  // the add of the second operand.
+  auto *Add = cast<Instruction>(*I->user_begin());
+  if (Add->getOpcode() != Instruction::Add || !Add->hasOneUser())
+    return false;
+
+  // That add should in turn only be used by a logical right shift.
+  auto *LShr = cast<Instruction>(*Add->user_begin());
+  if (LShr->getOpcode() != Instruction::LShr || !LShr->hasOneUser())
+    return false;
+
+  // The shift amount must be the constant 1. Test for null before use:
+  // dyn_cast returns nullptr when the shift amount is not a ConstantInt.
+  auto *LShrOp1 = dyn_cast<ConstantInt>(LShr->getOperand(1));
+  if (!LShrOp1 || !LShrOp1->isOne())
+    return false;
+
+  // Ensure the only user of the shift is a trunc which is casting
+  // back to the original element type.
+  auto *Trunc = cast<Instruction>(*LShr->user_begin());
+  if (Trunc->getOpcode() != Instruction::Trunc ||
+      Src->getScalarSizeInBits() !=
+          cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
+    return false;
+
+  return true;
+}
+
 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                  Type *Src,
                                                  TTI::CastContextHint CCH,
@@ -2068,6 +2134,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
       } else // Others are free so long as isWideningInstruction returned true.
         return 0;
     }
+
+    // The cast will be free for the SVE2 s/urhadd instructions
+    if (ST->hasSVE2() && (isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+        isExtShiftRightAdd(SingleUser, I, Dst, Src))
+      return 0;
   }
 
   // TODO: Allow non-throughput costs that aren't binary.

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd-costs.ll
new file mode 100644
index 00000000000000..d1296951f4867c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd-costs.ll
@@ -0,0 +1,237 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve2 -sve-tail-folding=simple -debug-only=loop-vectorize -S 2>%t < %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; SRHADD
+
+define void @srhadd_i8_zext_i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %dst, i64 %n) {
+; Signed case (the "zext" in the name is a misnomer): both sext i8 -> i16 should cost 0.
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 16 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 16 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 16 For instruction:   %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 16 For instruction:   %sext2 = sext i8 %ld2 to i16
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %ld1 = load i8, ptr %arrayidx1
+  %sext1 = sext i8 %ld1 to i16
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %ld2 = load i8, ptr %arrayidx2
+  %sext2 = sext i8 %ld2 to i16
+  %add1 = add nuw nsw i16 %sext1, 1
+  %add2 = add nuw nsw i16 %add1, %sext2
+  %shr = lshr i16 %add2, 1
+  %trunc = trunc i16 %shr to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+define void @srhadd_i16_zext_i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %dst, i64 %n) {
+; Signed case (the "zext" in the name is a misnomer): both sext i16 -> i32 should cost 0.
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction:   %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction:   %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction:   %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction:   %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction:   %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction:   %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction:   %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction:   %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction:   %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction:   %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %sext2 = sext i16 %ld2 to i32
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %ld1 = load i16, ptr %arrayidx1
+  %sext1 = sext i16 %ld1 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %ld2 = load i16, ptr %arrayidx2
+  %sext2 = sext i16 %ld2 to i32
+  %add1 = add nuw nsw i32 %sext1, 1
+  %add2 = add nuw nsw i32 %add1, %sext2
+  %shr = lshr i32 %add2, 1
+  %trunc = trunc i32 %shr to i16
+  %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+  store i16 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; URHADD
+
+define void @urhadd_i8_zext_i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %dst, i64 %n) {
+; Unsigned case: both zext i8 -> i16 should cost 0 for every VF checked below.
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 16 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 16 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 16 For instruction:   %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 16 For instruction:   %zext2 = zext i8 %ld2 to i16
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %ld1 = load i8, ptr %arrayidx1
+  %zext1 = zext i8 %ld1 to i16
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %ld2 = load i8, ptr %arrayidx2
+  %zext2 = zext i8 %ld2 to i16
+  %add1 = add nuw nsw i16 %zext1, 1
+  %add2 = add nuw nsw i16 %add1, %zext2
+  %shr = lshr i16 %add2, 1
+  %trunc = trunc i16 %shr to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+define void @urhadd_i16_zext_i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %dst, i64 %n) {
+; Unsigned case: both zext i16 -> i32 should cost 0 for every VF checked below.
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction:   %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction:   %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction:   %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction:   %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction:   %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction:   %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction:   %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction:   %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction:   %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction:   %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction:   %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction:   %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %zext2 = zext i16 %ld2 to i32
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %ld1 = load i16, ptr %arrayidx1
+  %zext1 = zext i16 %ld1 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %ld2 = load i16, ptr %arrayidx2
+  %zext2 = zext i16 %ld2 to i32
+  %add1 = add nuw nsw i32 %zext1, 1
+  %add2 = add nuw nsw i32 %add1, %zext2
+  %shr = lshr i32 %add2, 1
+  %trunc = trunc i32 %shr to i16
+  %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+  store i16 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
new file mode 100644
index 00000000000000..5fac0775214585
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
@@ -0,0 +1,129 @@
+; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve2 -sve-tail-folding=simple -S < %s | FileCheck %s
+
+; SRHADD
+
+define void @srhadd_i8_zext_i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %dst, i64 %n) {
+; CHECK-LABEL: @srhadd_i8_zext_i16(
+; CHECK: trunc <vscale x 16 x i16> {{.*}} to <vscale x 16 x i8>
+entry:
+  br label %for.body
+; NOTE: despite "zext" in the function name, the loop sign-extends its i8 inputs.
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %ld1 = load i8, ptr %arrayidx1
+  %sext1 = sext i8 %ld1 to i16
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %ld2 = load i8, ptr %arrayidx2
+  %sext2 = sext i8 %ld2 to i16
+  %add1 = add nuw nsw i16 %sext1, 1
+  %add2 = add nuw nsw i16 %add1, %sext2
+  %shr = lshr i16 %add2, 1
+  %trunc = trunc i16 %shr to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+define void @srhadd_i16_zext_i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %dst, i64 %n) {
+; CHECK-LABEL: @srhadd_i16_zext_i32(
+; CHECK: trunc <vscale x 8 x i32> {{.*}} to <vscale x 8 x i16>
+entry:
+  br label %for.body
+; NOTE: despite "zext" in the function name, the loop sign-extends its i16 inputs.
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %ld1 = load i16, ptr %arrayidx1
+  %sext1 = sext i16 %ld1 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %ld2 = load i16, ptr %arrayidx2
+  %sext2 = sext i16 %ld2 to i32
+  %add1 = add nuw nsw i32 %sext1, 1
+  %add2 = add nuw nsw i32 %add1, %sext2
+  %shr = lshr i32 %add2, 1
+  %trunc = trunc i32 %shr to i16
+  %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+  store i16 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; URHADD
+
+define void @urhadd_i8_zext_i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %dst, i64 %n) {
+; CHECK-LABEL: @urhadd_i8_zext_i16(
+; CHECK: trunc <vscale x 16 x i16> {{.*}} to <vscale x 16 x i8>
+entry:
+  br label %for.body
+; Loop body: zext both i8 loads to i16, add 1, add, lshr by 1, trunc back to i8.
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %ld1 = load i8, ptr %arrayidx1
+  %zext1 = zext i8 %ld1 to i16
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %ld2 = load i8, ptr %arrayidx2
+  %zext2 = zext i8 %ld2 to i16
+  %add1 = add nuw nsw i16 %zext1, 1
+  %add2 = add nuw nsw i16 %add1, %zext2
+  %shr = lshr i16 %add2, 1
+  %trunc = trunc i16 %shr to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+define void @urhadd_i16_zext_i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %dst, i64 %n) {
+; CHECK-LABEL: @urhadd_i16_zext_i32(
+; CHECK: trunc <vscale x 8 x i32> {{.*}} to <vscale x 8 x i16>
+entry:
+  br label %for.body
+; Loop body: zext both i16 loads to i32, add 1, add, lshr by 1, trunc back to i16.
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %ld1 = load i16, ptr %arrayidx1
+  %zext1 = zext i16 %ld1 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %ld2 = load i16, ptr %arrayidx2
+  %zext2 = zext i16 %ld2 to i32
+  %add1 = add nuw nsw i32 %zext1, 1
+  %add2 = add nuw nsw i32 %add1, %zext2
+  %shr = lshr i32 %add2, 1
+  %trunc = trunc i32 %shr to i16
+  %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+  store i16 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}