[llvm] [LV] Don't create partial reductions if factor doesn't match accumulator (PR #158603)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 15 04:05:27 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-llvm-transforms
Author: Florian Hahn (fhahn)
<details>
<summary>Changes</summary>
Check if the scale-factor of the accumulator is the same as the request ScaleFactor in tryToCreatePartialReductions.
This prevents creating partial reductions if not all instructions in the reduction chain form partial reductions. e.g. because we do not form a partial reduction for the loop exit instruction.
Currently code-gen works fine, because the scale factor of VPPartialReduction is not used during ::execute, but it means we compute incorrect cost/register pressure, because the partial reduction won't reduce to the specified scaling factor.
---
Full diff: https://github.com/llvm/llvm-project/pull/158603.diff
5 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+9-2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+1-15)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.cpp (+14)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.h (+4)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 640a98c622f80..2caeb1bf8a695 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8141,8 +8141,11 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
- return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
+ if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
+ if (auto PartialRed =
+ tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
+ return PartialRed;
+ }
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8176,6 +8179,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);
+ if (ScaleFactor !=
+ vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
+ return nullptr;
+
unsigned ReductionOpcode = Reduction->getOpcode();
if (ReductionOpcode == Instruction::Sub) {
auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..b4e62d39173e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -394,20 +394,6 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
return Base::properlyDominates(ParentA, ParentB);
}
-/// Get the VF scaling factor applied to the recipe's output, if the recipe has
-/// one.
-static unsigned getVFScaleFactor(VPRecipeBase *R) {
- if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
- return RR->getVFScaleFactor();
- if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
- return RR->getVFScaleFactor();
- assert(
- (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
- VPInstruction::ReductionStartVector) &&
- "getting scaling factor of reduction-start-vector not implemented yet");
- return 1;
-}
-
bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs) const {
return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
@@ -569,7 +555,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
} else {
// The output from scaled phis and scaled reductions actually has
// fewer lanes than the VF.
- unsigned ScaleFactor = getVFScaleFactor(R);
+ unsigned ScaleFactor = vputils::getVFScaleFactor(R);
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
LLVM_DEBUG(if (VF != VFs[J]) {
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index ddc4ad1977401..94c492f52e89f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -141,3 +141,17 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
});
return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
}
+
+unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
+ if (!R)
+ return 1;
+ if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
+ return RR->getVFScaleFactor();
+ if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+ return RR->getVFScaleFactor();
+ assert(
+ (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
+ VPInstruction::ReductionStartVector) &&
+ "getting scaling factor of reduction-start-vector not implemented yet");
+ return 1;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 77c099b271717..0dd6885da6fdd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -101,6 +101,10 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
/// Returns the header block of the first, top-level loop, or null if none
/// exist.
VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
+
+/// Get the VF scaling factor applied to the recipe's output, if the recipe has
+/// one.
+unsigned getVFScaleFactor(VPRecipeBase *R);
} // namespace vputils
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index c0995ec150c8d..f8b9d5db07fce 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1404,7 +1404,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.experimental.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1478,7 +1478,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32(<vscale x 8 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP7]])
+; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
``````````
</details>
https://github.com/llvm/llvm-project/pull/158603
More information about the llvm-commits
mailing list