[llvm] [NFCI][VPlan] Split initial mem-widening into a separate transformation (PR #182592)
Andrei Elovikov via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 3 15:49:35 PST 2026
https://github.com/eas updated https://github.com/llvm/llvm-project/pull/182592
>From c535af10c156db5095db2d27c1fd4a749722e89e Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Tue, 17 Feb 2026 13:29:13 -0800
Subject: [PATCH 1/6] [NFCI][VPlan] Split initial mem-widening into a separate
transformation
Preparation change before implementing stride-multiversioning as a
VPlan-based transformation. Might help
https://github.com/llvm/llvm-project/pull/147297/ as well.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 123 +++++++++++++-----
.../Transforms/Vectorize/VPRecipeBuilder.h | 26 ++--
.../Transforms/Vectorize/VPlanTransforms.h | 4 +
.../AArch64/predication_costs.ll | 5 +-
.../VPlan/vplan-print-after-all.ll | 1 +
5 files changed, 114 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 36c8c0560c9eb..3ad09570f2106 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8067,13 +8067,9 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
return tryToWidenCall(VPI, Range);
Instruction *Instr = R->getUnderlyingInstr();
- if (VPI->getOpcode() == Instruction::Store)
- if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr)))
- return tryToWidenHistogram(*HistInfo, VPI);
-
- if (VPI->getOpcode() == Instruction::Load ||
- VPI->getOpcode() == Instruction::Store)
- return tryToWidenMemory(VPI, Range);
+ assert(!is_contained({Instruction::Load, Instruction::Store},
+ VPI->getOpcode()) &&
+ "Should have been handled prior to this!");
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8244,9 +8240,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
HeaderVPBB);
- auto *MiddleVPBB = Plan->getMiddleBlock();
- VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
-
// Collect blocks that need predication for in-loop reduction recipes.
DenseSet<BasicBlock *> BlocksNeedingPredication;
for (BasicBlock *BB : OrigLoop->blocks())
@@ -8256,13 +8249,23 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanTransforms::createInLoopReductionRecipes(*Plan, BlocksNeedingPredication,
Range.Start);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
+ OrigLoop);
+
+ RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
+ Range, RecipeBuilder, CostCtx);
+
// Now process all other blocks and instructions.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
// Convert input VPInstructions to widened recipes.
for (VPRecipeBase &R : make_early_inc_range(
make_range(VPBB->getFirstNonPhi(), VPBB->end()))) {
- // Skip recipes that do not need transforming.
- if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(&R))
+ // Skip recipes that do not need transforming or have already been
+ // transformed.
+ if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe,
+ VPReplicateRecipe, VPWidenLoadRecipe, VPWidenStoreRecipe,
+ VPVectorPointerRecipe, VPVectorEndPointerRecipe,
+ VPHistogramRecipe>(&R))
continue;
auto *VPI = cast<VPInstruction>(&R);
if (!VPI->getUnderlyingValue())
@@ -8274,23 +8277,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
Builder.setInsertPoint(VPI);
- // The stores with invariant address inside the loop will be deleted, and
- // in the exit block, a uniform store recipe will be created for the final
- // invariant store of the reduction.
- StoreInst *SI;
- if ((SI = dyn_cast<StoreInst>(Instr)) &&
- Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
- // Only create recipe for the final invariant store of the reduction.
- if (Legal->isInvariantStoreOfReduction(SI)) {
- auto *Recipe = new VPReplicateRecipe(
- SI, VPI->operandsWithoutMask(), true /* IsUniform */,
- nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
- Recipe->insertBefore(*MiddleVPBB, MBIP);
- }
- R.eraseFromParent();
- continue;
- }
-
VPRecipeBase *Recipe =
RecipeBuilder.tryToCreateWidenNonPhiRecipe(VPI, Range);
if (!Recipe)
@@ -8358,8 +8344,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
- OrigLoop);
RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
Range);
RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
@@ -9997,3 +9981,80 @@ void LoopVectorizePass::printPipeline(
OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
OS << '>';
}
+
+void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
+ VPRecipeBuilder &RecipeBuilder,
+ VPCostContext &CostCtx) {
+ // Filter out scalar VPlan.
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ return;
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ HeaderVPBB);
+
+ // Collect all loads/stores first. We will start with ones having simpler
+ // decisions followed by more complex ones that are potentially
+ // guided/dependent on the simpler ones.
+ SmallVector<VPInstruction *> MemOps;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (VPI && VPI->getUnderlyingValue() &&
+ is_contained({Instruction::Load, Instruction::Store},
+ VPI->getOpcode()))
+ MemOps.push_back(VPI);
+ }
+ }
+
+ auto *Legal = CostCtx.CM.Legal;
+
+ auto *MiddleVPBB = Plan.getMiddleBlock();
+ VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+
+ for (VPInstruction *VPI : MemOps) {
+ Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
+ RecipeBuilder.getVPBuilder().setInsertPoint(VPI);
+
+ auto ReplaceWith = [&](VPRecipeBase *New) {
+ RecipeBuilder.setRecipe(Instr, New);
+ RecipeBuilder.getVPBuilder().insert(New);
+ if (VPI->getOpcode() == Instruction::Load)
+ VPI->replaceAllUsesWith(New->getVPSingleValue());
+ VPI->eraseFromParent();
+ };
+
+ // The stores with invariant address inside the loop will be deleted, and
+ // in the exit block, a uniform store recipe will be created for the final
+ // invariant store of the reduction.
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(Instr)) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
+ // Only create recipe for the final invariant store of the reduction.
+ if (Legal->isInvariantStoreOfReduction(SI)) {
+ auto *Recipe = new VPReplicateRecipe(
+ SI, VPI->operandsWithoutMask(), true /* IsUniform */,
+ nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
+ Recipe->insertBefore(*MiddleVPBB, MBIP);
+ }
+ VPI->eraseFromParent();
+ continue;
+ }
+
+ if (VPI->getOpcode() == Instruction::Store)
+ if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr))) {
+ ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
+ continue;
+ }
+
+ VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
+ if (!Recipe)
+ Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
+
+ ReplaceWith(Recipe);
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 64315df74dda5..0c261373e4e1b 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -52,11 +52,6 @@ class VPRecipeBuilder {
/// Range. The function should not be called for memory instructions or calls.
bool shouldWiden(Instruction *I, VFRange &Range) const;
- /// Check if the load or store instruction \p VPI should widened for \p
- /// Range.Start and potentially masked. Such instructions are handled by a
- /// recipe that takes an additional VPInstruction for the mask.
- VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
-
/// Optimize the special case where the operand of \p VPI is a constant
/// integer induction variable.
VPWidenIntOrFpInductionRecipe *
@@ -72,24 +67,31 @@ class VPRecipeBuilder {
/// cost-model indicates that widening should be performed.
VPWidenRecipe *tryToWiden(VPInstruction *VPI);
- /// Makes Histogram count operations safe for vectorization, by emitting a
- /// llvm.experimental.vector.histogram.add intrinsic in place of the
- /// Load + Add|Sub + Store operations that perform the histogram in the
- /// original scalar loop.
- VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
- VPInstruction *VPI);
-
public:
VPRecipeBuilder(VPlan &Plan, const TargetLibraryInfo *TLI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM, VPBuilder &Builder)
: Plan(Plan), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}
+ VPBuilder &getVPBuilder() const { return Builder; }
+
/// Create and return a widened recipe for a non-phi recipe \p R if one can be
/// created within the given VF \p Range.
VPRecipeBase *tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
VFRange &Range);
+ /// Check if the load or store instruction \p VPI should be widened for \p
+ /// Range.Start and potentially masked. Such instructions are handled by a
+ /// recipe that takes an additional VPInstruction for the mask.
+ VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
+
+ /// Makes Histogram count operations safe for vectorization, by emitting a
+ /// llvm.experimental.vector.histogram.add intrinsic in place of the
+ /// Load + Add|Sub + Store operations that perform the histogram in the
+ /// original scalar loop.
+ VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
+ VPInstruction *VPI);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 16f7ae2daeb5e..38df4468afa2b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -479,6 +479,10 @@ struct VPlanTransforms {
/// are only valid for a subset of VFs in Range, Range.End is updated.
static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
VFRange &Range);
+
+ static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
+ VPRecipeBuilder &RecipeBuilder,
+ VPCostContext &CostCtx);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index d84a6e27e5473..92d9a6e42fd28 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -179,8 +179,8 @@ for.end:
; Cost of store:
; store(4) / 2 = 2
;
-; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp2 = add nsw i32 %tmp1, %x
;
@@ -229,10 +229,11 @@ for.end:
; store(4) / 2 = 2
;
; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
-; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp3 = sdiv i32 %tmp1, %tmp2
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp3, %tmp2
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp5, ptr %tmp0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
index bc9367942ac27..8617788c90584 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
@@ -5,6 +5,7 @@
; CHECK: VPlan for loop in 'foo' after printAfterInitialConstruction
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::introduceMasksAndLinearize
+; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeMemOpWideningDecisions
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::clearReductionWrapFlags
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::optimizeFindIVReductions
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::handleMultiUseReductions
>From 10319e792dcb7c5e0d09fcb709b12bbf7817de2a Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Mon, 23 Feb 2026 09:04:23 -0800
Subject: [PATCH 2/6] Don't make unnecessary captures
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3ad09570f2106..47e2f6b845fd2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9987,7 +9987,7 @@ void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
VPCostContext &CostCtx) {
// Filter out scalar VPlan.
if (LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ [](ElementCount VF) { return VF.isScalar(); }, Range))
return;
// Scan the body of the loop in a topological order to visit each basic block
>From 7f998a9b2c80c90ca100261ddd352a1a96c426b6 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Mon, 23 Feb 2026 13:58:01 -0800
Subject: [PATCH 3/6] Move to VPlanTransforms, have to pass Legal explicitly
---
.../Transforms/Vectorize/LoopVectorize.cpp | 79 +------------------
.../Transforms/Vectorize/VPlanTransforms.cpp | 76 ++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
3 files changed, 80 insertions(+), 79 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 47e2f6b845fd2..14f533e491b0c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8253,7 +8253,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
OrigLoop);
RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
- Range, RecipeBuilder, CostCtx);
+ Range, RecipeBuilder, CostCtx, *CM.Legal);
// Now process all other blocks and instructions.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
@@ -9981,80 +9981,3 @@ void LoopVectorizePass::printPipeline(
OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
OS << '>';
}
-
-void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
- VPRecipeBuilder &RecipeBuilder,
- VPCostContext &CostCtx) {
- // Filter out scalar VPlan.
- if (LoopVectorizationPlanner::getDecisionAndClampRange(
- [](ElementCount VF) { return VF.isScalar(); }, Range))
- return;
-
- // Scan the body of the loop in a topological order to visit each basic block
- // after having visited its predecessor basic blocks.
- VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
- VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
- ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
- HeaderVPBB);
-
- // Collect all loads/stores first. We will start with ones having simpler
- // decisions followed by more complex ones that are potentially
- // guided/dependent on the simpler ones.
- SmallVector<VPInstruction *> MemOps;
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- for (VPRecipeBase &R : *VPBB) {
- auto *VPI = dyn_cast<VPInstruction>(&R);
- if (VPI && VPI->getUnderlyingValue() &&
- is_contained({Instruction::Load, Instruction::Store},
- VPI->getOpcode()))
- MemOps.push_back(VPI);
- }
- }
-
- auto *Legal = CostCtx.CM.Legal;
-
- auto *MiddleVPBB = Plan.getMiddleBlock();
- VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
-
- for (VPInstruction *VPI : MemOps) {
- Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
- RecipeBuilder.getVPBuilder().setInsertPoint(VPI);
-
- auto ReplaceWith = [&](VPRecipeBase *New) {
- RecipeBuilder.setRecipe(Instr, New);
- RecipeBuilder.getVPBuilder().insert(New);
- if (VPI->getOpcode() == Instruction::Load)
- VPI->replaceAllUsesWith(New->getVPSingleValue());
- VPI->eraseFromParent();
- };
-
- // The stores with invariant address inside the loop will be deleted, and
- // in the exit block, a uniform store recipe will be created for the final
- // invariant store of the reduction.
- StoreInst *SI;
- if ((SI = dyn_cast<StoreInst>(Instr)) &&
- Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
- // Only create recipe for the final invariant store of the reduction.
- if (Legal->isInvariantStoreOfReduction(SI)) {
- auto *Recipe = new VPReplicateRecipe(
- SI, VPI->operandsWithoutMask(), true /* IsUniform */,
- nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
- Recipe->insertBefore(*MiddleVPBB, MBIP);
- }
- VPI->eraseFromParent();
- continue;
- }
-
- if (VPI->getOpcode() == Instruction::Store)
- if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr))) {
- ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
- continue;
- }
-
- VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
- if (!Recipe)
- Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
-
- ReplaceWith(Recipe);
- }
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b8329bb396ff6..3c053089ff0af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -42,6 +42,7 @@
#include "llvm/Support/TypeSize.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
using namespace llvm;
using namespace VPlanPatternMatch;
@@ -6274,3 +6275,78 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan,
for (const VPPartialReductionChain &Chain : Chains)
transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
}
+
+void VPlanTransforms::makeMemOpWideningDecisions(
+ VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder,
+ VPCostContext &CostCtx, LoopVectorizationLegality &Legal) {
+ // Filter out scalar VPlan.
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [](ElementCount VF) { return VF.isScalar(); }, Range))
+ return;
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ HeaderVPBB);
+
+ // Collect all loads/stores first. We will start with ones having simpler
+ // decisions followed by more complex ones that are potentially
+ // guided/dependent on the simpler ones.
+ SmallVector<VPInstruction *> MemOps;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (VPI && VPI->getUnderlyingValue() &&
+ is_contained({Instruction::Load, Instruction::Store},
+ VPI->getOpcode()))
+ MemOps.push_back(VPI);
+ }
+ }
+
+ auto *MiddleVPBB = Plan.getMiddleBlock();
+ VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+
+ for (VPInstruction *VPI : MemOps) {
+ Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
+ RecipeBuilder.getVPBuilder().setInsertPoint(VPI);
+
+ auto ReplaceWith = [&](VPRecipeBase *New) {
+ RecipeBuilder.setRecipe(Instr, New);
+ RecipeBuilder.getVPBuilder().insert(New);
+ if (VPI->getOpcode() == Instruction::Load)
+ VPI->replaceAllUsesWith(New->getVPSingleValue());
+ VPI->eraseFromParent();
+ };
+
+ // The stores with invariant address inside the loop will be deleted, and
+ // in the exit block, a uniform store recipe will be created for the final
+ // invariant store of the reduction.
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(Instr)) &&
+ Legal.isInvariantAddressOfReduction(SI->getPointerOperand())) {
+ // Only create recipe for the final invariant store of the reduction.
+ if (Legal.isInvariantStoreOfReduction(SI)) {
+ auto *Recipe = new VPReplicateRecipe(
+ SI, VPI->operandsWithoutMask(), true /* IsUniform */,
+ nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
+ Recipe->insertBefore(*MiddleVPBB, MBIP);
+ }
+ VPI->eraseFromParent();
+ continue;
+ }
+
+ if (VPI->getOpcode() == Instruction::Store)
+ if (auto HistInfo = Legal.getHistogramInfo(cast<StoreInst>(Instr))) {
+ ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
+ continue;
+ }
+
+ VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
+ if (!Recipe)
+ Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
+
+ ReplaceWith(Recipe);
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 38df4468afa2b..55973b0bd1ccd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -26,6 +26,7 @@ namespace llvm {
class InductionDescriptor;
class Instruction;
class Loop;
+class LoopVectorizationLegality;
class LoopVersioning;
class OptimizationRemarkEmitter;
class PHINode;
@@ -482,7 +483,8 @@ struct VPlanTransforms {
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
VPRecipeBuilder &RecipeBuilder,
- VPCostContext &CostCtx);
+ VPCostContext &CostCtx,
+ LoopVectorizationLegality &Legal);
};
} // namespace llvm
>From d1080c47e31ce7abf33808f2b709ba7ffd37061e Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Mon, 23 Feb 2026 14:00:58 -0800
Subject: [PATCH 4/6] Braces for outer `if`
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3c053089ff0af..a932c4f7ccf11 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -6337,11 +6337,12 @@ void VPlanTransforms::makeMemOpWideningDecisions(
continue;
}
- if (VPI->getOpcode() == Instruction::Store)
+ if (VPI->getOpcode() == Instruction::Store) {
if (auto HistInfo = Legal.getHistogramInfo(cast<StoreInst>(Instr))) {
ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
continue;
}
+ }
VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
if (!Recipe)
>From 6f6d71083c9e76a5770278169fa70283931b3183 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Tue, 3 Mar 2026 13:42:26 -0800
Subject: [PATCH 5/6] Fold one `Legal` use into `tryToWidenHistogram` renamed
to `widenIfHistogram`
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 9 +++++++--
llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 12 ++++++------
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++--
3 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 14f533e491b0c..2e1974f9543cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7960,8 +7960,13 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
};
}
-VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
- VPInstruction *VPI) {
+VPHistogramRecipe *VPRecipeBuilder::widenIfHistogram(VPInstruction *VPI) {
+ auto HistInfo =
+ Legal->getHistogramInfo(cast<StoreInst>(VPI->getUnderlyingInstr()));
+ if (!HistInfo)
+ return nullptr;
+
+ const HistogramInfo *HI = *HistInfo;
// FIXME: Support other operations.
unsigned Opcode = HI->Update->getOpcode();
assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 0c261373e4e1b..080151e7cd2cf 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -85,12 +85,12 @@ class VPRecipeBuilder {
/// recipe that takes an additional VPInstruction for the mask.
VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
- /// Makes Histogram count operations safe for vectorization, by emitting a
- /// llvm.experimental.vector.histogram.add intrinsic in place of the
- /// Load + Add|Sub + Store operations that perform the histogram in the
- /// original scalar loop.
- VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
- VPInstruction *VPI);
+ /// If \p VPI represents a histogram operation (as determined by
+ /// LoopVectorizationLegality), make it safe for vectorization by emitting a
+ /// llvm.experimental.vector.histogram.add intrinsic in place of the Load +
+ /// Add|Sub + Store operations that perform the histogram in the original
+ /// scalar loop.
+ VPHistogramRecipe *widenIfHistogram(VPInstruction *VPI);
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a932c4f7ccf11..93f80157aaf31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -6338,8 +6338,8 @@ void VPlanTransforms::makeMemOpWideningDecisions(
}
if (VPI->getOpcode() == Instruction::Store) {
- if (auto HistInfo = Legal.getHistogramInfo(cast<StoreInst>(Instr))) {
- ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
+ if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
+ ReplaceWith(Histogram);
continue;
}
}
>From e95f9bb052aae2b4d6a693c84bb10d4077fe5da1 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Tue, 3 Mar 2026 15:37:00 -0800
Subject: [PATCH 6/6] Move another `Legal` use to
`VPRecipeBuilder::replaceWithFinalIfReductionStore`
---
.../Transforms/Vectorize/LoopVectorize.cpp | 22 ++++++++++++-
.../Transforms/Vectorize/VPRecipeBuilder.h | 8 +++++
.../Transforms/Vectorize/VPlanTransforms.cpp | 31 ++++++-------------
.../Transforms/Vectorize/VPlanTransforms.h | 4 +--
4 files changed, 40 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2e1974f9543cd..e90f3975786d4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7986,6 +7986,26 @@ VPHistogramRecipe *VPRecipeBuilder::widenIfHistogram(VPInstruction *VPI) {
return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
}
+bool VPRecipeBuilder::replaceWithFinalIfReductionStore(
+ VPBuilder &FinalRedStoresBuilder, VPInstruction *VPI) {
+ StoreInst *SI;
+ if ((SI = dyn_cast<StoreInst>(VPI->getUnderlyingInstr())) &&
+ Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
+ // Only create recipe for the final invariant store of the reduction.
+ if (Legal->isInvariantStoreOfReduction(SI)) {
+ auto *Recipe = new VPReplicateRecipe(
+ SI, VPI->operandsWithoutMask(), true /* IsUniform */,
+ nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
+ FinalRedStoresBuilder.insert(Recipe);
+ // Recipe->insertBefore(*MiddleVPBB, MBIP);
+ }
+ VPI->eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
VFRange &Range) {
auto *I = VPI->getUnderlyingInstr();
@@ -8258,7 +8278,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
OrigLoop);
RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
- Range, RecipeBuilder, CostCtx, *CM.Legal);
+ Range, RecipeBuilder, CostCtx);
// Now process all other blocks and instructions.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 080151e7cd2cf..a908c25de3fd5 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,6 +92,14 @@ class VPRecipeBuilder {
/// scalar loop.
VPHistogramRecipe *widenIfHistogram(VPInstruction *VPI);
+ /// The stores with invariant address inside the loop will be deleted, and in
+ /// the exit block, a uniform store recipe will be created for the final
+ /// invariant store of the reduction. Returns `true` if replacement took
+ /// place. The order of stores must be preserved, hence \p
+ /// FinalRedStoresBuilder.
+ bool replaceWithFinalIfReductionStore(VPBuilder &FinalRedStoresBuilder,
+ VPInstruction *VPI);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 93f80157aaf31..137da0fa4267d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -42,7 +42,6 @@
#include "llvm/Support/TypeSize.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
using namespace llvm;
using namespace VPlanPatternMatch;
@@ -6276,16 +6275,19 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan,
transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
}
-void VPlanTransforms::makeMemOpWideningDecisions(
- VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder,
- VPCostContext &CostCtx, LoopVectorizationLegality &Legal) {
+void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
+ VPRecipeBuilder &RecipeBuilder,
+ VPCostContext &CostCtx) {
// Filter out scalar VPlan.
if (LoopVectorizationPlanner::getDecisionAndClampRange(
[](ElementCount VF) { return VF.isScalar(); }, Range))
return;
// Scan the body of the loop in a topological order to visit each basic block
- // after having visited its predecessor basic blocks.
+ // after having visited its predecessor basic blocks. This is necessary
+ // because we need to preserve the order of the reduction stores into
+ // invariant address when transforming those to a scalar store outside the
+ // vector loop body.
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
@@ -6306,7 +6308,7 @@ void VPlanTransforms::makeMemOpWideningDecisions(
}
auto *MiddleVPBB = Plan.getMiddleBlock();
- VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+ VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
for (VPInstruction *VPI : MemOps) {
Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
@@ -6320,22 +6322,9 @@ void VPlanTransforms::makeMemOpWideningDecisions(
VPI->eraseFromParent();
};
- // The stores with invariant address inside the loop will be deleted, and
- // in the exit block, a uniform store recipe will be created for the final
- // invariant store of the reduction.
- StoreInst *SI;
- if ((SI = dyn_cast<StoreInst>(Instr)) &&
- Legal.isInvariantAddressOfReduction(SI->getPointerOperand())) {
- // Only create recipe for the final invariant store of the reduction.
- if (Legal.isInvariantStoreOfReduction(SI)) {
- auto *Recipe = new VPReplicateRecipe(
- SI, VPI->operandsWithoutMask(), true /* IsUniform */,
- nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
- Recipe->insertBefore(*MiddleVPBB, MBIP);
- }
- VPI->eraseFromParent();
+ if (RecipeBuilder.replaceWithFinalIfReductionStore(FinalRedStoresBuilder,
+ VPI))
continue;
- }
if (VPI->getOpcode() == Instruction::Store) {
if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 55973b0bd1ccd..38df4468afa2b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -26,7 +26,6 @@ namespace llvm {
class InductionDescriptor;
class Instruction;
class Loop;
-class LoopVectorizationLegality;
class LoopVersioning;
class OptimizationRemarkEmitter;
class PHINode;
@@ -483,8 +482,7 @@ struct VPlanTransforms {
static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
VPRecipeBuilder &RecipeBuilder,
- VPCostContext &CostCtx,
- LoopVectorizationLegality &Legal);
+ VPCostContext &CostCtx);
};
} // namespace llvm
More information about the llvm-commits
mailing list