[llvm] dda2cd2 - [AArch64][SVE2] Change the cost of extends with S/URHADD to 0
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 14 03:33:07 PDT 2023
Author: Kerry McLaughlin
Date: 2023-08-14T10:32:06Z
New Revision: dda2cd2505301aa626fcd3e8dea2a447227d00ca
URL: https://github.com/llvm/llvm-project/commit/dda2cd2505301aa626fcd3e8dea2a447227d00ca
DIFF: https://github.com/llvm/llvm-project/commit/dda2cd2505301aa626fcd3e8dea2a447227d00ca.diff
LOG: [AArch64][SVE2] Change the cost of extends with S/URHADD to 0
When SVE2 is enabled, an add of 1, a second add and a shift right by 1 can be
combined into a single s/urhadd instruction. If the operands of the adds are
extended, the extends fold into the s/urhadd, so their cost should be 0.
Reviewed By: dtemirbulatov
Differential Revision: https://reviews.llvm.org/D157628
Added:
llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd-costs.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
Modified:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8bc9a0a1b78a64..7310e95220d622 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2044,6 +2044,72 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
}
+// Where SVE2 is enabled, we can combine an add of 1, add & shift right by 1
+// to a single s/urhadd instruction. Some extends can be folded into the
+// instruction and will be 'free', e.g.
+//   %ld1 = load i8, ptr %a
+//   %zext1 = zext i8 %ld1 to i16
+//   %ld2 = load i8, ptr %b
+//   %zext2 = zext i8 %ld2 to i16
+//   %add1 = add nuw nsw i16 %zext1, 1
+//   %add2 = add nuw nsw i16 %add1, %zext2
+//   %shr = lshr i16 %add2, 1
+//   %trunc = trunc i16 %shr to i8
+//
+// \p Ext is the extend being costed and \p I is the instruction to start
+// matching from (the caller passes the extend's user). \p Src and \p Dst are
+// the source and destination types of the extend. Returns true only if Dst has
+// twice the scalar width of Src and the single-use chain
+// add(.., 1) -> add -> lshr(.., 1) -> trunc shown above is found, with the
+// trunc producing elements of the same scalar width as Src.
+bool isExtShiftRightAdd(const Instruction *I, const Instruction *Ext, Type *Dst,
+                        Type *Src) {
+  // Check that the cast is doubling the source type.
+  if ((Src->getScalarSizeInBits() != Dst->getScalarSizeInBits() / 2) ||
+      I->getOpcode() != Instruction::Add || !I->hasOneUser())
+    return false;
+
+  // If I is already the add of a constant (%add1 above), match from it.
+  auto *Op1 = dyn_cast<ConstantInt>(I->getOperand(1));
+  if (!Op1) {
+    // Otherwise I may be the second add (%add2 above): step over to its other
+    // operand, which must itself be a single-use add of a constant.
+    auto *Op = I->getOperand(0) == Ext ? I->getOperand(1) : I->getOperand(0);
+
+    I = dyn_cast<Instruction>(Op);
+    if (!I || I->getOpcode() != Instruction::Add || !I->hasOneUser())
+      return false;
+
+    Op1 = dyn_cast<ConstantInt>(I->getOperand(1));
+  }
+
+  // The constant being added must be exactly 1. isOne() compares the APInt
+  // directly, so it cannot assert on constants wider than 64 bits the way
+  // getZExtValue()/getSExtValue() can.
+  if (!Op1 || !Op1->isOne())
+    return false;
+
+  // The add of 1 should only have one user, the second add.
+  auto *Add = cast<Instruction>(*I->user_begin());
+  if (Add->getOpcode() != Instruction::Add || !Add->hasOneUser())
+    return false;
+
+  // The second add should only have one user, a right shift of 1.
+  auto *LShr = cast<Instruction>(*Add->user_begin());
+  if (LShr->getOpcode() != Instruction::LShr || !LShr->hasOneUser())
+    return false;
+
+  // The shift amount may not be a constant, so test the dyn_cast result for
+  // null before reading its value.
+  auto *LShrOp1 = dyn_cast<ConstantInt>(LShr->getOperand(1));
+  if (!LShrOp1 || !LShrOp1->isOne())
+    return false;
+
+  // Ensure the only user of the shift is a trunc which is casting
+  // back to the original element type.
+  auto *Trunc = cast<Instruction>(*LShr->user_begin());
+  if (Trunc->getOpcode() != Instruction::Trunc ||
+      Src->getScalarSizeInBits() !=
+          cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
+    return false;
+
+  return true;
+}
+
InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
Type *Src,
TTI::CastContextHint CCH,
@@ -2068,6 +2134,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
} else // Others are free so long as isWideningInstruction returned true.
return 0;
}
+
+ // The cast will be free for the SVE2 s/urhadd instructions
+ if (ST->hasSVE2() && (isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+ isExtShiftRightAdd(SingleUser, I, Dst, Src))
+ return 0;
}
// TODO: Allow non-throughput costs that aren't binary.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd-costs.ll
new file mode 100644
index 00000000000000..d1296951f4867c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd-costs.ll
@@ -0,0 +1,237 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve2 -sve-tail-folding=simple -debug-only=loop-vectorize -S 2>%t < %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; SRHADD
+
+; Scalar loop computing trunc((sext(a[i]) + 1 + sext(b[i])) >> 1) on i8 data
+; widened to i16. The CHECK-COST lines expect both sext instructions to be
+; reported with an estimated cost of 0 for every listed fixed and scalable VF.
+; NOTE(review): the function name says "zext" but the body uses sext.
+; NOTE(review): %dst is declared readonly yet is the pointer stored through
+; below - confirm this is intended.
+; NOTE(review): the adds carry nuw on sign-extended operands, which looks
+; questionable for negative inputs - confirm.
+define void @srhadd_i8_zext_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 16 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 16 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %sext2 = sext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 16 For instruction: %sext1 = sext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 16 For instruction: %sext2 = sext i8 %ld2 to i16
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+ %ld1 = load i8, ptr %arrayidx1
+ %sext1 = sext i8 %ld1 to i16
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+ %ld2 = load i8, ptr %arrayidx2
+ %sext2 = sext i8 %ld2 to i16
+ %add1 = add nuw nsw i16 %sext1, 1
+ %add2 = add nuw nsw i16 %add1, %sext2
+ %shr = lshr i16 %add2, 1
+ %trunc = trunc i16 %shr to i8
+ %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %trunc, ptr %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Scalar loop computing trunc((sext(a[i]) + 1 + sext(b[i])) >> 1) on i16 data
+; widened to i32. The CHECK-COST lines expect both sext instructions to be
+; reported with an estimated cost of 0 for every listed fixed and scalable VF.
+; NOTE(review): the function name says "zext" but the body uses sext.
+; NOTE(review): %dst is declared readonly yet is the pointer stored through
+; below - confirm this is intended.
+; NOTE(review): the adds carry nuw on sign-extended operands, which looks
+; questionable for negative inputs - confirm.
+define void @srhadd_i16_zext_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction: %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction: %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction: %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction: %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction: %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction: %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %sext2 = sext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %sext1 = sext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %sext2 = sext i16 %ld2 to i32
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+ %ld1 = load i16, ptr %arrayidx1
+ %sext1 = sext i16 %ld1 to i32
+ %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+ %ld2 = load i16, ptr %arrayidx2
+ %sext2 = sext i16 %ld2 to i32
+ %add1 = add nuw nsw i32 %sext1, 1
+ %add2 = add nuw nsw i32 %add1, %sext2
+ %shr = lshr i32 %add2, 1
+ %trunc = trunc i32 %shr to i16
+ %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+ store i16 %trunc, ptr %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; URHADD
+
+; Scalar loop computing trunc((zext(a[i]) + 1 + zext(b[i])) >> 1) on i8 data
+; widened to i16. The CHECK-COST lines expect both zext instructions to be
+; reported with an estimated cost of 0 for every listed fixed and scalable VF.
+; NOTE(review): %dst is declared readonly yet is the pointer stored through
+; below - confirm this is intended.
+define void @urhadd_i8_zext_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 16 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 16 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %zext2 = zext i8 %ld2 to i16
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 16 For instruction: %zext1 = zext i8 %ld1 to i16
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 16 For instruction: %zext2 = zext i8 %ld2 to i16
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+ %ld1 = load i8, ptr %arrayidx1
+ %zext1 = zext i8 %ld1 to i16
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+ %ld2 = load i8, ptr %arrayidx2
+ %zext2 = zext i8 %ld2 to i16
+ %add1 = add nuw nsw i16 %zext1, 1
+ %add2 = add nuw nsw i16 %add1, %zext2
+ %shr = lshr i16 %add2, 1
+ %trunc = trunc i16 %shr to i8
+ %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %trunc, ptr %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Scalar loop computing trunc((zext(a[i]) + 1 + zext(b[i])) >> 1) on i16 data
+; widened to i32. The CHECK-COST lines expect both zext instructions to be
+; reported with an estimated cost of 0 for every listed fixed and scalable VF.
+; NOTE(review): %dst is declared readonly yet is the pointer stored through
+; below - confirm this is intended.
+define void @urhadd_i16_zext_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 1 For instruction: %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction: %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 2 For instruction: %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction: %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 4 For instruction: %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction: %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF 8 For instruction: %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %zext2 = zext i16 %ld2 to i32
+
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %zext1 = zext i16 %ld1 to i32
+; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %zext2 = zext i16 %ld2 to i32
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+ %ld1 = load i16, ptr %arrayidx1
+ %zext1 = zext i16 %ld1 to i32
+ %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+ %ld2 = load i16, ptr %arrayidx2
+ %zext2 = zext i16 %ld2 to i32
+ %add1 = add nuw nsw i32 %zext1, 1
+ %add2 = add nuw nsw i32 %add1, %zext2
+ %shr = lshr i32 %add2, 1
+ %trunc = trunc i32 %shr to i16
+ %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+ store i16 %trunc, ptr %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
new file mode 100644
index 00000000000000..5fac0775214585
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
@@ -0,0 +1,129 @@
+; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve2 -sve-tail-folding=simple -S < %s | FileCheck %s
+
+; SRHADD
+
+; Scalar loop computing trunc((sext(a[i]) + 1 + sext(b[i])) >> 1) on i8 data
+; widened to i16. The CHECK line expects the loop-vectorize output to contain
+; a trunc from <vscale x 16 x i16> to <vscale x 16 x i8>.
+; NOTE(review): the function name says "zext" but the body uses sext.
+; NOTE(review): %dst is declared readonly yet is the pointer stored through
+; below - confirm this is intended.
+define void @srhadd_i8_zext_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @srhadd_i8_zext_i16(
+; CHECK: trunc <vscale x 16 x i16> {{.*}} to <vscale x 16 x i8>
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+ %ld1 = load i8, ptr %arrayidx1
+ %sext1 = sext i8 %ld1 to i16
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+ %ld2 = load i8, ptr %arrayidx2
+ %sext2 = sext i8 %ld2 to i16
+ %add1 = add nuw nsw i16 %sext1, 1
+ %add2 = add nuw nsw i16 %add1, %sext2
+ %shr = lshr i16 %add2, 1
+ %trunc = trunc i16 %shr to i8
+ %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %trunc, ptr %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Scalar loop computing trunc((sext(a[i]) + 1 + sext(b[i])) >> 1) on i16 data
+; widened to i32. The CHECK line expects the loop-vectorize output to contain
+; a trunc from <vscale x 8 x i32> to <vscale x 8 x i16>.
+; NOTE(review): the function name says "zext" but the body uses sext.
+; NOTE(review): %dst is declared readonly yet is the pointer stored through
+; below - confirm this is intended.
+define void @srhadd_i16_zext_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @srhadd_i16_zext_i32(
+; CHECK: trunc <vscale x 8 x i32> {{.*}} to <vscale x 8 x i16>
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+ %ld1 = load i16, ptr %arrayidx1
+ %sext1 = sext i16 %ld1 to i32
+ %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+ %ld2 = load i16, ptr %arrayidx2
+ %sext2 = sext i16 %ld2 to i32
+ %add1 = add nuw nsw i32 %sext1, 1
+ %add2 = add nuw nsw i32 %add1, %sext2
+ %shr = lshr i32 %add2, 1
+ %trunc = trunc i32 %shr to i16
+ %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+ store i16 %trunc, ptr %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; URHADD
+
+; Scalar loop computing trunc((zext(a[i]) + 1 + zext(b[i])) >> 1) on i8 data
+; widened to i16. The CHECK line expects the loop-vectorize output to contain
+; a trunc from <vscale x 16 x i16> to <vscale x 16 x i8>.
+; NOTE(review): %dst is declared readonly yet is the pointer stored through
+; below - confirm this is intended.
+define void @urhadd_i8_zext_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @urhadd_i8_zext_i16(
+; CHECK: trunc <vscale x 16 x i16> {{.*}} to <vscale x 16 x i8>
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+ %ld1 = load i8, ptr %arrayidx1
+ %zext1 = zext i8 %ld1 to i16
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+ %ld2 = load i8, ptr %arrayidx2
+ %zext2 = zext i8 %ld2 to i16
+ %add1 = add nuw nsw i16 %zext1, 1
+ %add2 = add nuw nsw i16 %add1, %zext2
+ %shr = lshr i16 %add2, 1
+ %trunc = trunc i16 %shr to i8
+ %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+ store i8 %trunc, ptr %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Scalar loop computing trunc((zext(a[i]) + 1 + zext(b[i])) >> 1) on i16 data
+; widened to i32. The CHECK line expects the loop-vectorize output to contain
+; a trunc from <vscale x 8 x i32> to <vscale x 8 x i16>.
+; NOTE(review): %dst is declared readonly yet is the pointer stored through
+; below - confirm this is intended.
+define void @urhadd_i16_zext_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @urhadd_i16_zext_i32(
+; CHECK: trunc <vscale x 8 x i32> {{.*}} to <vscale x 8 x i16>
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+ %ld1 = load i16, ptr %arrayidx1
+ %zext1 = zext i16 %ld1 to i32
+ %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+ %ld2 = load i16, ptr %arrayidx2
+ %zext2 = zext i16 %ld2 to i32
+ %add1 = add nuw nsw i32 %zext1, 1
+ %add2 = add nuw nsw i32 %add1, %zext2
+ %shr = lshr i32 %add2, 1
+ %trunc = trunc i32 %shr to i16
+ %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+ store i16 %trunc, ptr %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
More information about the llvm-commits
mailing list