[llvm] a7b4dd4 - [LV] Don't create partial reductions if factor doesn't match accumulator (#158603)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 04:21:07 PDT 2025
Author: Florian Hahn
Date: 2025-09-24T12:21:03+01:00
New Revision: a7b4dd42bd00c57dbf42bdb1e2a351065c7282be
URL: https://github.com/llvm/llvm-project/commit/a7b4dd42bd00c57dbf42bdb1e2a351065c7282be
DIFF: https://github.com/llvm/llvm-project/commit/a7b4dd42bd00c57dbf42bdb1e2a351065c7282be.diff
LOG: [LV] Don't create partial reductions if factor doesn't match accumulator (#158603)
Check whether the scale factor of the accumulator matches the requested
ScaleFactor in tryToCreatePartialReduction.
This prevents creating partial reductions when not all instructions in the
reduction chain form partial reductions, e.g. because we do not form a
partial reduction for the loop-exit instruction.
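For illustration, a scalar loop along the lines of the
red_extended_add_incomplete_chain test updated in this patch hits that case:
the first add could form a partial reduction of the zero-extended load, but
the loop-carried value is produced by the second add, which is not one. A
hypothetical C++ rendering (the function name is illustrative):

  // Sketch of the problematic pattern, mirroring the
  // red_extended_add_incomplete_chain test below.
  int red_chain(const unsigned char *start, const unsigned char *end,
                int offset) {
    int red = 0;
    for (const unsigned char *p = start; p != end; ++p) {
      red += *p;     // extended add: i8 zero-extended to i32
      red += offset; // loop-exit instruction; not a partial reduction
    }
    return red;
  }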
Currently code-gen works fine, because the scale factor of
VPPartialReductionRecipe is not used during ::execute, but it means we
compute an incorrect cost and register pressure, because the partial
reduction won't actually narrow its result by the specified scale factor.
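As a rough sketch of the register-pressure effect (the VF and scale factor
are assumed values for illustration; the division mirrors the
divideCoefficientBy call in calculateRegisterUsageForPlan in the diff
below):

  #include "llvm/Support/TypeSize.h"
  #include <cstdio>
  using namespace llvm;

  int main() {
    // Assumed values for illustration only.
    ElementCount VF = ElementCount::getFixed(16);
    unsigned ScaleFactor = 4; // claimed by the partial reduction
    // Register-pressure estimation divides the VF by the scale factor,
    // so the accumulator is costed as if it had only 4 live lanes ...
    ElementCount Scaled = VF.divideCoefficientBy(ScaleFactor);
    // ... but if the chain never actually narrows, all 16 lanes stay live.
    printf("costed lanes: %u, actual lanes: %u\n",
           Scaled.getKnownMinValue(), VF.getKnownMinValue());
    return 0;
  }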
PR: https://github.com/llvm/llvm-project/pull/158603
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
llvm/lib/Transforms/Vectorize/VPlanUtils.h
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ae9e70197d2cc..4e8be28df7934 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8183,8 +8183,11 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
- return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
+ if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
+ if (auto PartialRed =
+ tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
+ return PartialRed;
+ }
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8218,6 +8221,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);
+ if (ScaleFactor !=
+ vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
+ return nullptr;
+
unsigned ReductionOpcode = Reduction->getOpcode();
if (ReductionOpcode == Instruction::Sub) {
auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 46ab7712e2671..07bfe7a896d86 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -395,20 +395,6 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
return Base::properlyDominates(ParentA, ParentB);
}
-/// Get the VF scaling factor applied to the recipe's output, if the recipe has
-/// one.
-static unsigned getVFScaleFactor(VPValue *R) {
- if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
- return RR->getVFScaleFactor();
- if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
- return RR->getVFScaleFactor();
- assert(
- (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
- VPInstruction::ReductionStartVector) &&
- "getting scaling factor of reduction-start-vector not implemented yet");
- return 1;
-}
-
bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs) const {
return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
@@ -571,7 +557,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
} else {
// The output from scaled phis and scaled reductions actually has
// fewer lanes than the VF.
- unsigned ScaleFactor = getVFScaleFactor(VPV);
+ unsigned ScaleFactor =
+ vputils::getVFScaleFactor(VPV->getDefiningRecipe());
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
LLVM_DEBUG(if (VF != VFs[J]) {
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 917aa01f8a926..eac0e705a877d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -141,6 +141,20 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
}
+unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
+ if (!R)
+ return 1;
+ if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
+ return RR->getVFScaleFactor();
+ if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+ return RR->getVFScaleFactor();
+ assert(
+ (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
+ VPInstruction::ReductionStartVector) &&
+ "getting scaling factor of reduction-start-vector not implemented yet");
+ return 1;
+}
+
std::optional<VPValue *>
vputils::getRecipesForUncountableExit(VPlan &Plan,
SmallVectorImpl<VPRecipeBase *> &Recipes,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 33dd8efaec2db..0222b0aa81063 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -102,6 +102,10 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
/// exist.
VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
+/// Get the VF scaling factor applied to the recipe's output, if the recipe has
+/// one.
+unsigned getVFScaleFactor(VPRecipeBase *R);
+
/// Returns the VPValue representing the uncountable exit comparison used by
/// AnyOf if the recipes it depends on can be traced back to live-ins and
/// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index f4784b6259ce1..229209e98af78 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1381,8 +1381,8 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
}
-define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
-; CHECK-NEON-LABEL: define i32 @red_extended_add_chain(
+define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
+; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1404,7 +1404,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1415,7 +1415,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
-; CHECK-SVE-LABEL: define i32 @red_extended_add_chain(
+; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1452,7 +1452,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
-; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_chain(
+; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
@@ -1478,7 +1478,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 8 x i32> @llvm.vector.partial.reduce.add.nxv8i32.nxv8i32(<vscale x 8 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP7]])
+; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]