[llvm-branch-commits] [llvm] [LV] Bundle partial reductions inside VPExpressionRecipe (PR #147302)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jul 7 06:44:32 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Sam Tebbs (SamTebbs33)
<details>
<summary>Changes</summary>
This PR bundles partial reductions inside the VPExpressionRecipe class.
Depends on https://github.com/llvm/llvm-project/pull/147255 .
---
Patch is 202.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147302.diff
16 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+2)
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+15-4)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+1-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+6-2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+23)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+7-4)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll (+55-35)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll (+98-98)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll (+42-42)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll (+299-279)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll (+14-22)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll (+12-12)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll (+11-20)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll (+4-7)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll (+26-26)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 3cc0ea01953c3..338599a9bb5aa 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -223,6 +223,8 @@ class TargetTransformInfo {
/// Get the kind of extension that an instruction represents.
LLVM_ABI static PartialReductionExtendKind
getPartialReductionExtendKind(Instruction *I);
+ LLVM_ABI static PartialReductionExtendKind
+ getPartialReductionExtendKind(Instruction::CastOps CastOpc);
/// Construct a TTI object using a type implementing the \c Concept
/// API below.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index ba0d070bffe6d..5e9733a264e22 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1001,13 +1001,24 @@ InstructionCost TargetTransformInfo::getShuffleCost(
TargetTransformInfo::PartialReductionExtendKind
TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
- if (isa<SExtInst>(I))
- return PR_SignExtend;
- if (isa<ZExtInst>(I))
- return PR_ZeroExtend;
+ if (auto *Cast = dyn_cast<CastInst>(I))
+ return getPartialReductionExtendKind(Cast->getOpcode());
return PR_None;
}
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(
+ Instruction::CastOps CastOpc) {
+ switch (CastOpc) {
+ case Instruction::CastOps::ZExt:
+ return PR_ZeroExtend;
+ case Instruction::CastOps::SExt:
+ return PR_SignExtend;
+ default:
+ return PR_None;
+ }
+}
+
TTI::CastContextHint
TargetTransformInfo::getCastContextHint(const Instruction *I) {
if (!I)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d9a367535baf4..5021a490839b2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5294,7 +5294,7 @@ InstructionCost AArch64TTIImpl::getExtendedReductionCost(
EVT ResVT = TLI->getValueType(DL, ResTy);
if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
- VecVT.getSizeInBits() >= 64) {
+ VecVT.isFixedLengthVector() && VecVT.getSizeInBits() >= 64) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
// The legal cases are:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1bc926db301d8..30f3566332d79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2470,7 +2470,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPPartialReductionSC;
}
static inline bool classof(const VPUser *U) {
@@ -2532,7 +2533,10 @@ class VPPartialReductionRecipe : public VPReductionRecipe {
Opcode(Opcode), VFScaleFactor(ScaleFactor) {
[[maybe_unused]] auto *AccumulatorRecipe =
getChainOp()->getDefiningRecipe();
- assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
+ // When cloning as part of a VPExpressionRecipe, the chain op could have
+ // been removed from the plan and so doesn't have a defining recipe.
+ assert((!AccumulatorRecipe ||
+ isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
"Unexpected operand order for partial reduction recipe");
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c20b1920c3791..6293129c74a1d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -164,6 +164,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects();
case VPBlendSC:
case VPReductionEVLSC:
+ case VPPartialReductionSC:
case VPReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
@@ -2678,6 +2679,23 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
case ExpressionTypes::ExtNegatedMulAccReduction:
case ExpressionTypes::ExtMulAccReduction: {
bool Negated = ExpressionType == ExpressionTypes::ExtNegatedMulAccReduction;
+ if (isa<VPPartialReductionRecipe>(ExpressionRecipes.back())) {
+ auto *Ext0R = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+ auto *Ext1R = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+ auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
+ unsigned Opcode =
+ ExpressionType == ExpressionTypes::ExtNegatedMulAccReduction
+ ? Instruction::Sub
+ : Instruction::Add;
+ return Ctx.TTI.getPartialReductionCost(
+ Opcode, Ctx.Types.inferScalarType(getOperand(0)),
+ Ctx.Types.inferScalarType(getOperand(1)), RedTy, VF,
+ TargetTransformInfo::getPartialReductionExtendKind(
+ Ext0R->getOpcode()),
+ TargetTransformInfo::getPartialReductionExtendKind(
+ Ext1R->getOpcode()),
+ Mul->getOpcode(), Ctx.CostKind);
+ }
return Ctx.TTI.getMulAccReductionCost(
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
Instruction::ZExt,
@@ -2710,6 +2728,7 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
O << " = ";
auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+ bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
@@ -2732,6 +2751,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
case ExpressionTypes::ExtNegatedMulAccReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
O << " + ";
+ if (IsPartialReduction)
+ O << "partial.";
O << "reduce."
<< Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
@@ -2758,6 +2779,8 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
case ExpressionTypes::ExtMulAccReduction: {
getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
O << " + ";
+ if (IsPartialReduction)
+ O << "partial.";
O << "reduce."
<< Instruction::getOpcodeName(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a09d2037e97b4..8757b5635dbef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2899,6 +2899,7 @@ static VPExpressionRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
using namespace VPlanPatternMatch;
+ bool IsPartialReduction = isa<VPPartialReductionRecipe>(Red);
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
if (Opcode != Instruction::Add)
@@ -2955,12 +2956,14 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
// Match reduce.add(mul(ext, ext)).
if (RecipeA && RecipeB &&
- (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
+ (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B ||
+ IsPartialReduction) &&
match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
- IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
- Instruction::CastOps::ZExt,
- MulR, RecipeA, RecipeB, nullptr, Sub)) {
+ (IsPartialReduction ||
+ IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
+ Instruction::CastOps::ZExt,
+ MulR, RecipeA, RecipeB, nullptr, Sub))) {
if (Sub)
return new VPExpressionRecipe(
RecipeA, RecipeB, MulR,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index b02b314ecbd67..46cdb73129181 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -34,10 +34,11 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP11]]
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -127,10 +128,11 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP15]], [[TMP19]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP17]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -200,10 +202,11 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
-; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP9]], [[TMP14]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -292,10 +295,11 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP15]], [[TMP20]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -364,11 +368,12 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP16]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -457,11 +462,12 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP19]], [[TMP22]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -532,11 +538,12 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
-; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEON-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP17]]
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -626,11 +633,12 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
-; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]]
+; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP22]], [[TMP23]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP18]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/147302
More information about the llvm-branch-commits
mailing list