[llvm] [NFCI][VPlan] Split initial mem-widening into a separate transformation (PR #182592)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 20 13:18:53 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-vectorizers
Author: Andrei Elovikov (eas)
<details>
<summary>Changes</summary>
Preparation change before implementing stride-multiversioning as a VPlan-based transformation. It might also help
https://github.com/llvm/llvm-project/pull/147297/.
---
Full diff: https://github.com/llvm/llvm-project/pull/182592.diff
5 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+92-31)
- (modified) llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h (+14-12)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+4)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll (+3-2)
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll (+1)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6299e8c2dbd32..c0694ebcad464 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8088,13 +8088,9 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
return tryToWidenCall(VPI, Range);
Instruction *Instr = R->getUnderlyingInstr();
- if (VPI->getOpcode() == Instruction::Store)
- if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr)))
- return tryToWidenHistogram(*HistInfo, VPI);
-
- if (VPI->getOpcode() == Instruction::Load ||
- VPI->getOpcode() == Instruction::Store)
- return tryToWidenMemory(VPI, Range);
+ assert(!is_contained({Instruction::Load, Instruction::Store},
+ VPI->getOpcode()) &&
+ "Should have been handled prior to this!");
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8265,9 +8261,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
HeaderVPBB);
- auto *MiddleVPBB = Plan->getMiddleBlock();
- VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
-
// Collect blocks that need predication for in-loop reduction recipes.
DenseSet<BasicBlock *> BlocksNeedingPredication;
for (BasicBlock *BB : OrigLoop->blocks())
@@ -8277,13 +8270,23 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanTransforms::createInLoopReductionRecipes(*Plan, BlocksNeedingPredication,
Range.Start);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
+ OrigLoop);
+
+ RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
+ Range, RecipeBuilder, CostCtx);
+
// Now process all other blocks and instructions.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
// Convert input VPInstructions to widened recipes.
for (VPRecipeBase &R : make_early_inc_range(
make_range(VPBB->getFirstNonPhi(), VPBB->end()))) {
- // Skip recipes that do not need transforming.
- if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(&R))
+ // Skip recipes that do not need transforming or have already been
+ // transformed.
+ if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe,
+ VPReplicateRecipe, VPWidenLoadRecipe, VPWidenStoreRecipe,
+ VPVectorPointerRecipe, VPVectorEndPointerRecipe,
+ VPHistogramRecipe>(&R))
continue;
auto *VPI = cast<VPInstruction>(&R);
if (!VPI->getUnderlyingValue())
@@ -8295,23 +8298,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
Builder.setInsertPoint(VPI);
- // The stores with invariant address inside the loop will be deleted, and
- // in the exit block, a uniform store recipe will be created for the final
- // invariant store of the reduction.
- StoreInst *SI;
- if ((SI = dyn_cast<StoreInst>(Instr)) &&
- Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
- // Only create recipe for the final invariant store of the reduction.
- if (Legal->isInvariantStoreOfReduction(SI)) {
- auto *Recipe = new VPReplicateRecipe(
- SI, VPI->operandsWithoutMask(), true /* IsUniform */,
- nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
- Recipe->insertBefore(*MiddleVPBB, MBIP);
- }
- R.eraseFromParent();
- continue;
- }
-
VPRecipeBase *Recipe =
RecipeBuilder.tryToCreateWidenNonPhiRecipe(VPI, Range);
if (!Recipe)
@@ -8378,8 +8364,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
- OrigLoop);
RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
Range);
RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
@@ -10026,3 +10010,80 @@ void LoopVectorizePass::printPipeline(
OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
OS << '>';
}
+
+void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
+ VPRecipeBuilder &RecipeBuilder,
+ VPCostContext &CostCtx) {
+ // Filter out scalar VPlan.
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ return;
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ HeaderVPBB);
+
+ // Collect all loads/stores first. We will start with ones having simpler
+ // decisions followed by more complex ones that are potentially
+ // guided/dependent on the simpler ones.
+ SmallVector<VPInstruction *> MemOps;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (VPI && VPI->getUnderlyingValue() &&
+ is_contained({Instruction::Load, Instruction::Store},
+ VPI->getOpcode()))
+ MemOps.push_back(VPI);
+ }
+ }
+
+ auto *Legal = CostCtx.CM.Legal;
+
+ auto *MiddleVPBB = Plan.getMiddleBlock();
+ VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+
+ for (VPInstruction *VPI : MemOps) {
+ Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
+ RecipeBuilder.getVPBuilder().setInsertPoint(VPI);
+
+ auto ReplaceWith = [&](VPRecipeBase *New) {
+ RecipeBuilder.setRecipe(Instr, New);
+ RecipeBuilder.getVPBuilder().insert(New);
+ if (VPI->getOpcode() == Instruction::Load)
+ VPI->replaceAllUsesWith(New->getVPSingleValue());
+ VPI->eraseFromParent();
+ };
+
+ // The stores with invariant address inside the loop will be deleted, and
+ // in the exit block, a uniform store recipe will be created for the final
+ // invariant store of the reduction.
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(Instr)) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
+ // Only create recipe for the final invariant store of the reduction.
+ if (Legal->isInvariantStoreOfReduction(SI)) {
+ auto *Recipe = new VPReplicateRecipe(
+ SI, VPI->operandsWithoutMask(), true /* IsUniform */,
+ nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
+ Recipe->insertBefore(*MiddleVPBB, MBIP);
+ }
+ VPI->eraseFromParent();
+ continue;
+ }
+
+ if (VPI->getOpcode() == Instruction::Store)
+ if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr))) {
+ ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
+ continue;
+ }
+
+ VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
+ if (!Recipe)
+ Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
+
+ ReplaceWith(Recipe);
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 64315df74dda5..0c261373e4e1b 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -52,11 +52,6 @@ class VPRecipeBuilder {
/// Range. The function should not be called for memory instructions or calls.
bool shouldWiden(Instruction *I, VFRange &Range) const;
- /// Check if the load or store instruction \p VPI should widened for \p
- /// Range.Start and potentially masked. Such instructions are handled by a
- /// recipe that takes an additional VPInstruction for the mask.
- VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
-
/// Optimize the special case where the operand of \p VPI is a constant
/// integer induction variable.
VPWidenIntOrFpInductionRecipe *
@@ -72,24 +67,31 @@ class VPRecipeBuilder {
/// cost-model indicates that widening should be performed.
VPWidenRecipe *tryToWiden(VPInstruction *VPI);
- /// Makes Histogram count operations safe for vectorization, by emitting a
- /// llvm.experimental.vector.histogram.add intrinsic in place of the
- /// Load + Add|Sub + Store operations that perform the histogram in the
- /// original scalar loop.
- VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
- VPInstruction *VPI);
-
public:
VPRecipeBuilder(VPlan &Plan, const TargetLibraryInfo *TLI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM, VPBuilder &Builder)
: Plan(Plan), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}
+ VPBuilder &getVPBuilder() const { return Builder; }
+
/// Create and return a widened recipe for a non-phi recipe \p R if one can be
/// created within the given VF \p Range.
VPRecipeBase *tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
VFRange &Range);
+  /// Check if the load or store instruction \p VPI should be widened for \p
+ /// Range.Start and potentially masked. Such instructions are handled by a
+ /// recipe that takes an additional VPInstruction for the mask.
+ VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
+
+ /// Makes Histogram count operations safe for vectorization, by emitting a
+ /// llvm.experimental.vector.histogram.add intrinsic in place of the
+ /// Load + Add|Sub + Store operations that perform the histogram in the
+ /// original scalar loop.
+ VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
+ VPInstruction *VPI);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f2dfc166cecc9..787a687f19cdd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -483,6 +483,10 @@ struct VPlanTransforms {
/// are only valid for a subset of VFs in Range, Range.End is updated.
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
VFRange &Range);
+
+ static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
+ VPRecipeBuilder &RecipeBuilder,
+ VPCostContext &CostCtx);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index d84a6e27e5473..92d9a6e42fd28 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -179,8 +179,8 @@ for.end:
; Cost of store:
; store(4) / 2 = 2
;
-; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp2 = add nsw i32 %tmp1, %x
;
@@ -229,10 +229,11 @@ for.end:
; store(4) / 2 = 2
;
; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
-; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp3 = sdiv i32 %tmp1, %tmp2
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp3, %tmp2
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp5, ptr %tmp0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
index bc9367942ac27..8617788c90584 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
@@ -5,6 +5,7 @@
; CHECK: VPlan for loop in 'foo' after printAfterInitialConstruction
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::introduceMasksAndLinearize
+; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeMemOpWideningDecisions
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::clearReductionWrapFlags
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::optimizeFindIVReductions
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::handleMultiUseReductions
``````````
</details>
https://github.com/llvm/llvm-project/pull/182592
More information about the llvm-commits
mailing list