[llvm] f5d153e - [VectorCombine] Fold binary op of reductions. (#121567)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 22 03:11:37 PST 2025
Author: Mikhail Gudim
Date: 2025-02-22T06:11:33-05:00
New Revision: f5d153ef26a9a206ab6eb4307de24c16b600c0d9
URL: https://github.com/llvm/llvm-project/commit/f5d153ef26a9a206ab6eb4307de24c16b600c0d9
DIFF: https://github.com/llvm/llvm-project/commit/f5d153ef26a9a206ab6eb4307de24c16b600c0d9.diff
LOG: [VectorCombine] Fold binary op of reductions. (#121567)
Replace a binary op of two reductions with one reduction of the binary op
applied to vectors. For example:
```
%v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
%v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
%res = add i32 %v0_red, %v1_red
```
gets transformed to:
```
%1 = add <16 x i32> %v0, %v1
%res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
```
Added:
llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll
Modified:
llvm/include/llvm/Transforms/Utils/LoopUtils.h
llvm/lib/Transforms/Utils/LoopUtils.cpp
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index b4cd52fef70fd..1007b9d48fb72 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -365,6 +365,8 @@ constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK);
/// Returns the arithmetic instruction opcode used when expanding a reduction.
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID);
+/// Returns the reduction intrinsic id for \p Opc, or not_intrinsic if none.
+Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc);
/// Returns the min/max intrinsic used when expanding a min/max reduction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID);
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 45915c10107b2..0506ea915a23f 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -957,6 +957,7 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
}
}
+// This is the inverse to getReductionForBinop
unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
switch (RdxID) {
case Intrinsic::vector_reduce_fadd:
@@ -986,6 +987,25 @@ unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
}
}
+// Maps a binary opcode to the vector.reduce.* intrinsic that combines the
+// lanes of a vector with that same operation. This goes in the opposite
+// direction of getArithmeticReductionInstruction (intrinsic id -> opcode).
+// Any opcode without an entry below yields Intrinsic::not_intrinsic.
+Intrinsic::ID llvm::getReductionForBinop(Instruction::BinaryOps Opc) {
+  switch (Opc) {
+  case Instruction::Add:
+    return Intrinsic::vector_reduce_add;
+  case Instruction::Mul:
+    return Intrinsic::vector_reduce_mul;
+  case Instruction::And:
+    return Intrinsic::vector_reduce_and;
+  case Instruction::Or:
+    return Intrinsic::vector_reduce_or;
+  case Instruction::Xor:
+    return Intrinsic::vector_reduce_xor;
+  default:
+    return Intrinsic::not_intrinsic;
+  }
+}
+
Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID) {
switch (RdxID) {
default:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index cdb8853f7503c..2e20961b4c912 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -114,6 +114,7 @@ class VectorCombine {
bool scalarizeBinopOrCmp(Instruction &I);
bool scalarizeVPIntrinsic(Instruction &I);
bool foldExtractedCmps(Instruction &I);
+ bool foldBinopOfReductions(Instruction &I);
bool foldSingleElementStore(Instruction &I);
bool scalarizeLoadExtract(Instruction &I);
bool foldConcatOfBoolMasks(Instruction &I);
@@ -1242,6 +1243,121 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
return true;
}
+/// Computes the cost of the vector reduction \p II, taking into account the
+/// instruction pattern that produces its operand.
+///
+/// Three shapes are recognised, from most to least specific:
+///   1. reduce.add(ext(mul(ext(A), ext(B))))  -> costed via
+///      TTI.getMulAccReductionCost
+///   2. reduce(ext(A))                        -> costed via
+///      TTI.getExtendedReductionCost
+///   3. anything else                         -> costed via
+///      TTI.getArithmeticReductionCost
+///
+/// \param II the vector.reduce.* intrinsic call being analysed.
+/// \param CostKind the cost kind forwarded to every TTI query.
+/// \param TTI the target cost model.
+/// \param [out] CostBeforeReduction cost of the matched instructions feeding
+///   the reduction (the extend, or the ext/mul/ext chain). It is written only
+///   for shapes 1 and 2; for shape 3 the caller's value is left untouched, so
+///   callers must initialise it.
+/// \param [out] CostAfterReduction cost of the reduction itself, as reported
+///   by the TTI hook selected for the matched shape.
+static void analyzeCostOfVecReduction(const IntrinsicInst &II,
+                                      TTI::TargetCostKind CostKind,
+                                      const TargetTransformInfo &TTI,
+                                      InstructionCost &CostBeforeReduction,
+                                      InstructionCost &CostAfterReduction) {
+  Instruction *Op0 = nullptr, *Op1 = nullptr;
+  auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
+  auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
+  unsigned ReductionOpc =
+      getArithmeticReductionInstruction(II.getIntrinsicID());
+
+  // The multiply-accumulate shape has to be tested before the plain extend
+  // shape: its root is itself a zext/sext, so checking the generic extend
+  // first would always match and this case would never be reached.
+  if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
+      match(RedOp,
+            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
+      match(Op0, m_ZExtOrSExt(m_Value())) &&
+      Op0->getOpcode() == Op1->getOpcode() &&
+      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
+    // Matched reduce.add(ext(mul(ext(A), ext(B))))
+    bool IsUnsigned = isa<ZExtInst>(Op0);
+    auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
+    // Op0 feeds a vector multiply, so its type already is the vector type of
+    // the multiply. It must not be passed to VectorType::get as an element
+    // type, because a vector is not a valid vector element type.
+    auto *MulType = cast<VectorType>(Op0->getType());
+
+    InstructionCost ExtCost =
+        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
+                             TTI::CastContextHint::None, CostKind, Op0);
+    InstructionCost MulCost =
+        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
+    InstructionCost Ext2Cost =
+        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
+                             TTI::CastContextHint::None, CostKind, RedOp);
+
+    // Two inner extends, one multiply and the outer extend.
+    CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
+    CostAfterReduction =
+        TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+    return;
+  }
+
+  if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
+    // Matched reduce(ext(A))
+    bool IsUnsigned = isa<ZExtInst>(RedOp);
+    auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
+
+    CostBeforeReduction =
+        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
+                             TTI::CastContextHint::None, CostKind, RedOp);
+    CostAfterReduction =
+        TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
+                                     ExtType, FastMathFlags(), CostKind);
+    return;
+  }
+
+  // No special operand pattern: plain reduction cost only.
+  CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
+                                                      std::nullopt, CostKind);
+}
+
+/// Tries to rewrite `binop(reduce(V0), reduce(V1))` into
+/// `reduce(binop(V0, V1))`.
+///
+/// Both operands of \p I must be single-use calls to the same reduction
+/// intrinsic over vectors of identical type, and the rewritten form must be
+/// strictly cheaper according to the cost model. A scalar `sub` is paired
+/// with add reductions. Returns true if \p I was replaced.
+bool VectorCombine::foldBinopOfReductions(Instruction &I) {
+  const Instruction::BinaryOps Opcode = cast<BinaryOperator>(&I)->getOpcode();
+
+  // The difference of two sums equals the sum of the lane-wise differences,
+  // so `sub` is matched against add reductions.
+  const Intrinsic::ID RdxIID = Opcode == Instruction::Sub
+                                   ? Intrinsic::vector_reduce_add
+                                   : getReductionForBinop(Opcode);
+  if (RdxIID == Intrinsic::not_intrinsic)
+    return false;
+
+  // Returns V as an intrinsic call if it is a call to RdxIID whose only user
+  // is I, otherwise nullptr.
+  auto AsSingleUseReduction = [RdxIID](Value *V) -> IntrinsicInst * {
+    auto *Call = dyn_cast<IntrinsicInst>(V);
+    if (Call && Call->getIntrinsicID() == RdxIID && Call->hasOneUse())
+      return Call;
+    return nullptr;
+  };
+
+  const IntrinsicInst *Rdx0 = AsSingleUseReduction(I.getOperand(0));
+  if (!Rdx0)
+    return false;
+  const IntrinsicInst *Rdx1 = AsSingleUseReduction(I.getOperand(1));
+  if (!Rdx1)
+    return false;
+
+  Value *V0 = Rdx0->getArgOperand(0);
+  Value *V1 = Rdx1->getArgOperand(0);
+  auto *VTy = cast<VectorType>(V0->getType());
+  if (VTy != V1->getType())
+    return false;
+
+  // Both calls use RdxIID, so one opcode lookup serves the merged reduction.
+  const unsigned ReductionOpc = getArithmeticReductionInstruction(RdxIID);
+
+  InstructionCost FeedCost0 = 0, RdxCost0 = 0;
+  InstructionCost FeedCost1 = 0, RdxCost1 = 0;
+  analyzeCostOfVecReduction(*Rdx0, CostKind, TTI, FeedCost0, RdxCost0);
+  analyzeCostOfVecReduction(*Rdx1, CostKind, TTI, FeedCost1, RdxCost1);
+
+  // Old form: two reductions plus the scalar binop.
+  const InstructionCost OldCost =
+      RdxCost0 + RdxCost1 + TTI.getInstructionCost(&I, CostKind);
+
+  // New form: the operand-producing instructions, one vector binop and a
+  // single plain reduction.
+  InstructionCost NewCost = FeedCost0 + FeedCost1;
+  NewCost += TTI.getArithmeticInstrCost(Opcode, VTy, CostKind);
+  NewCost +=
+      TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
+  if (!NewCost.isValid() || NewCost >= OldCost)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
+                    << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+
+  Value *VectorBO = Builder.CreateBinOp(Opcode, V0, V1);
+  // Only the `disjoint` flag is carried over to the vector operation.
+  auto *ScalarDisjoint = dyn_cast<PossiblyDisjointInst>(&I);
+  auto *VectorDisjoint = dyn_cast<PossiblyDisjointInst>(VectorBO);
+  if (ScalarDisjoint && VectorDisjoint)
+    VectorDisjoint->setIsDisjoint(ScalarDisjoint->isDisjoint());
+
+  Instruction *Rdx = Builder.CreateIntrinsic(RdxIID, {VTy}, {VectorBO});
+  replaceValue(I, *Rdx);
+  return true;
+}
+
// Check if memory loc modified between two instrs in the same BB
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
BasicBlock::iterator End,
@@ -3380,6 +3496,7 @@ bool VectorCombine::run() {
if (Instruction::isBinaryOp(Opcode)) {
MadeChange |= foldExtractExtract(I);
MadeChange |= foldExtractedCmps(I);
+ MadeChange |= foldBinopOfReductions(I);
}
break;
}
diff --git a/llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll b/llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll
new file mode 100644
index 0000000000000..ad362ef2bf900
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mattr=+mve.fp -passes=vector-combine -S | FileCheck %s
+
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+define i16 @add_of_reduce_add(<8 x i16> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: define i16 @add_of_reduce_add(
+; CHECK-SAME: <8 x i16> [[V0:%.*]], <8 x i16> [[V1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i16> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP1]])
+; CHECK-NEXT: ret i16 [[RES]]
+;
+ %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v0)
+ %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v1)
+ %res = add i16 %v0_red, %v1_red
+ ret i16 %res
+}
+
+define i16 @reduce_zext_0(<8 x i8> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: define i16 @reduce_zext_0(
+; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i16> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT_:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
+; CHECK-NEXT: [[V0_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[ZEXT_]])
+; CHECK-NEXT: [[V1_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[V1]])
+; CHECK-NEXT: [[RES:%.*]] = add i16 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: ret i16 [[RES]]
+;
+ %zext_ = zext <8 x i8> %v0 to <8 x i16>
+ %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %zext_)
+ %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v1)
+ %res = add i16 %v0_red, %v1_red
+ ret i16 %res
+}
+
+define i16 @reduce_zext_1(<8 x i16> %v0, <8 x i8> %v1) {
+; CHECK-LABEL: define i16 @reduce_zext_1(
+; CHECK-SAME: <8 x i16> [[V0:%.*]], <8 x i8> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ZEXT_:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
+; CHECK-NEXT: [[V0_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[V0]])
+; CHECK-NEXT: [[V1_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[ZEXT_]])
+; CHECK-NEXT: [[RES:%.*]] = add i16 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: ret i16 [[RES]]
+;
+ %zext_ = zext <8 x i8> %v1 to <8 x i16>
+ %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v0)
+ %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %zext_)
+ %res = add i16 %v0_red, %v1_red
+ ret i16 %res
+}
+
+define i32 @mul_acc_pattern_0(<8 x i8> %v0, <8 x i8> %v1, <8 x i32> %v2) {
+; CHECK-LABEL: define i32 @mul_acc_pattern_0(
+; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INNER_ZEXT_0:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
+; CHECK-NEXT: [[INNER_ZEXT_1:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
+; CHECK-NEXT: [[MUL_:%.*]] = mul <8 x i16> [[INNER_ZEXT_0]], [[INNER_ZEXT_1]]
+; CHECK-NEXT: [[ZEXT_:%.*]] = zext <8 x i16> [[MUL_]] to <8 x i32>
+; CHECK-NEXT: [[RED_MUL_ACC_PATTERN:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[ZEXT_]])
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V2]])
+; CHECK-NEXT: [[RES:%.*]] = add i32 [[RED_MUL_ACC_PATTERN]], [[RED]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %inner_zext_0 = zext <8 x i8> %v0 to <8 x i16>
+ %inner_zext_1 = zext <8 x i8> %v1 to <8 x i16>
+ %mul_ = mul <8 x i16> %inner_zext_0, %inner_zext_1
+ %zext_ = zext <8 x i16> %mul_ to <8 x i32>
+ %red_mul_acc_pattern = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext_)
+ %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v2)
+ %res = add i32 %red_mul_acc_pattern, %red
+ ret i32 %res
+}
+
+define i32 @mul_acc_pattern_1(<8 x i8> %v0, <8 x i8> %v1, <8 x i32> %v2) {
+; CHECK-LABEL: define i32 @mul_acc_pattern_1(
+; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[INNER_ZEXT_0:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
+; CHECK-NEXT: [[INNER_ZEXT_1:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
+; CHECK-NEXT: [[MUL_:%.*]] = mul <8 x i16> [[INNER_ZEXT_0]], [[INNER_ZEXT_1]]
+; CHECK-NEXT: [[ZEXT_:%.*]] = zext <8 x i16> [[MUL_]] to <8 x i32>
+; CHECK-NEXT: [[RED_MUL_ACC_PATTERN:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[ZEXT_]])
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V2]])
+; CHECK-NEXT: [[RES:%.*]] = add i32 [[RED]], [[RED_MUL_ACC_PATTERN]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %inner_zext_0 = zext <8 x i8> %v0 to <8 x i16>
+ %inner_zext_1 = zext <8 x i8> %v1 to <8 x i16>
+ %mul_ = mul <8 x i16> %inner_zext_0, %inner_zext_1
+ %zext_ = zext <8 x i16> %mul_ to <8 x i32>
+ %red_mul_acc_pattern = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext_)
+ %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v2)
+ %res = add i32 %red, %red_mul_acc_pattern
+ ret i32 %res
+}
diff --git a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
index 86f17cdfb79b4..5f29af9de5a39 100644
--- a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
@@ -1,12 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
define i32 @add_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @add_of_reduce_add(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT: [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
@@ -31,9 +30,8 @@ define i32 @sub_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @mul_of_reduce_mul(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT: [[RES:%.*]] = mul i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v0)
@@ -45,9 +43,8 @@ define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @and_of_reduce_and(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT: [[RES:%.*]] = and i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v0)
@@ -59,9 +56,8 @@ define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @or_of_reduce_or(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT: [[RES:%.*]] = or i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: [[TMP1:%.*]] = or <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
@@ -73,9 +69,8 @@ define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @xor_of_reduce_xor(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @xor_of_reduce_xor(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT: [[RES:%.*]] = xor i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v0)
@@ -161,9 +156,8 @@ define i32 @multiple_use_of_reduction_1(<16 x i32> %v0, <16 x i32> %v1, ptr %p)
define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @do_not_preserve_overflow_flags(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
@@ -175,9 +169,8 @@ define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1) {
; CHECK-LABEL: define i32 @preserve_disjoint_flags(
; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT: [[RES:%.*]] = or disjoint i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: [[TMP1:%.*]] = or disjoint <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
@@ -189,9 +182,8 @@ define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1) {
define i32 @add_of_reduce_add_vscale(<vscale x 16 x i32> %v0, <vscale x 16 x i32> %v1) {
; CHECK-LABEL: define i32 @add_of_reduce_add_vscale(
; CHECK-SAME: <vscale x 16 x i32> [[V0:%.*]], <vscale x 16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT: [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V0]])
-; CHECK-NEXT: [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V1]])
-; CHECK-NEXT: [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT: [[TMP1:%.*]] = add <vscale x 16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP1]])
; CHECK-NEXT: ret i32 [[RES]]
;
%v0_red = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %v0)
More information about the llvm-commits
mailing list