[llvm] [NFCI][VPlan] Split initial mem-widening into a separate transformation (PR #182592)

Andrei Elovikov via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 3 15:49:35 PST 2026


https://github.com/eas updated https://github.com/llvm/llvm-project/pull/182592

>From c535af10c156db5095db2d27c1fd4a749722e89e Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Tue, 17 Feb 2026 13:29:13 -0800
Subject: [PATCH 1/6] [NFCI][VPlan] Split initial mem-widening into a separate
 transformation

Preparatory change before implementing stride-multiversioning as a
VPlan-based transformation. It might also help
https://github.com/llvm/llvm-project/pull/147297/.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 123 +++++++++++++-----
 .../Transforms/Vectorize/VPRecipeBuilder.h    |  26 ++--
 .../Transforms/Vectorize/VPlanTransforms.h    |   4 +
 .../AArch64/predication_costs.ll              |   5 +-
 .../VPlan/vplan-print-after-all.ll            |   1 +
 5 files changed, 114 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 36c8c0560c9eb..3ad09570f2106 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8067,13 +8067,9 @@ VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
     return tryToWidenCall(VPI, Range);
 
   Instruction *Instr = R->getUnderlyingInstr();
-  if (VPI->getOpcode() == Instruction::Store)
-    if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr)))
-      return tryToWidenHistogram(*HistInfo, VPI);
-
-  if (VPI->getOpcode() == Instruction::Load ||
-      VPI->getOpcode() == Instruction::Store)
-    return tryToWidenMemory(VPI, Range);
+  assert(!is_contained({Instruction::Load, Instruction::Store},
+                       VPI->getOpcode()) &&
+         "Should have been handled prior to this!");
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8244,9 +8240,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       HeaderVPBB);
 
-  auto *MiddleVPBB = Plan->getMiddleBlock();
-  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
-
   // Collect blocks that need predication for in-loop reduction recipes.
   DenseSet<BasicBlock *> BlocksNeedingPredication;
   for (BasicBlock *BB : OrigLoop->blocks())
@@ -8256,13 +8249,23 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   VPlanTransforms::createInLoopReductionRecipes(*Plan, BlocksNeedingPredication,
                                                 Range.Start);
 
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
+                        OrigLoop);
+
+  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
+                           Range, RecipeBuilder, CostCtx);
+
   // Now process all other blocks and instructions.
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
     // Convert input VPInstructions to widened recipes.
     for (VPRecipeBase &R : make_early_inc_range(
              make_range(VPBB->getFirstNonPhi(), VPBB->end()))) {
-      // Skip recipes that do not need transforming.
-      if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(&R))
+      // Skip recipes that do not need transforming or have already been
+      // transformed.
+      if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe,
+              VPReplicateRecipe, VPWidenLoadRecipe, VPWidenStoreRecipe,
+              VPVectorPointerRecipe, VPVectorEndPointerRecipe,
+              VPHistogramRecipe>(&R))
         continue;
       auto *VPI = cast<VPInstruction>(&R);
       if (!VPI->getUnderlyingValue())
@@ -8274,23 +8277,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
       Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
       Builder.setInsertPoint(VPI);
 
-      // The stores with invariant address inside the loop will be deleted, and
-      // in the exit block, a uniform store recipe will be created for the final
-      // invariant store of the reduction.
-      StoreInst *SI;
-      if ((SI = dyn_cast<StoreInst>(Instr)) &&
-          Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
-        // Only create recipe for the final invariant store of the reduction.
-        if (Legal->isInvariantStoreOfReduction(SI)) {
-          auto *Recipe = new VPReplicateRecipe(
-              SI, VPI->operandsWithoutMask(), true /* IsUniform */,
-              nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
-          Recipe->insertBefore(*MiddleVPBB, MBIP);
-        }
-        R.eraseFromParent();
-        continue;
-      }
-
       VPRecipeBase *Recipe =
           RecipeBuilder.tryToCreateWidenNonPhiRecipe(VPI, Range);
       if (!Recipe)
@@ -8358,8 +8344,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
   if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
-                          OrigLoop);
     RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
                    Range);
     RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
@@ -9997,3 +9981,80 @@ void LoopVectorizePass::printPipeline(
   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
   OS << '>';
 }
+
+void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
+                                                 VPRecipeBuilder &RecipeBuilder,
+                                                 VPCostContext &CostCtx) {
+  // Filter out scalar VPlan.
+  if (LoopVectorizationPlanner::getDecisionAndClampRange(
+          [&](ElementCount VF) { return VF.isScalar(); }, Range))
+    return;
+
+  // Scan the body of the loop in a topological order to visit each basic block
+  // after having visited its predecessor basic blocks.
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      HeaderVPBB);
+
+  // Collect all loads/stores first. We will start with ones having simpler
+  // decisions followed by more complex ones that are potentially
+  // guided/dependent on the simpler ones.
+  SmallVector<VPInstruction *> MemOps;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : *VPBB) {
+      auto *VPI = dyn_cast<VPInstruction>(&R);
+      if (VPI && VPI->getUnderlyingValue() &&
+          is_contained({Instruction::Load, Instruction::Store},
+                       VPI->getOpcode()))
+        MemOps.push_back(VPI);
+    }
+  }
+
+  auto *Legal = CostCtx.CM.Legal;
+
+  auto *MiddleVPBB = Plan.getMiddleBlock();
+  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+
+  for (VPInstruction *VPI : MemOps) {
+    Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
+    RecipeBuilder.getVPBuilder().setInsertPoint(VPI);
+
+    auto ReplaceWith = [&](VPRecipeBase *New) {
+      RecipeBuilder.setRecipe(Instr, New);
+      RecipeBuilder.getVPBuilder().insert(New);
+      if (VPI->getOpcode() == Instruction::Load)
+        VPI->replaceAllUsesWith(New->getVPSingleValue());
+      VPI->eraseFromParent();
+    };
+
+    // The stores with invariant address inside the loop will be deleted, and
+    // in the exit block, a uniform store recipe will be created for the final
+    // invariant store of the reduction.
+    StoreInst *SI;
+    if ((SI = dyn_cast<StoreInst>(Instr)) &&
+        Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
+      // Only create recipe for the final invariant store of the reduction.
+      if (Legal->isInvariantStoreOfReduction(SI)) {
+        auto *Recipe = new VPReplicateRecipe(
+            SI, VPI->operandsWithoutMask(), true /* IsUniform */,
+            nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
+        Recipe->insertBefore(*MiddleVPBB, MBIP);
+      }
+      VPI->eraseFromParent();
+      continue;
+    }
+
+    if (VPI->getOpcode() == Instruction::Store)
+      if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr))) {
+        ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
+        continue;
+      }
+
+    VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
+    if (!Recipe)
+      Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
+
+    ReplaceWith(Recipe);
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 64315df74dda5..0c261373e4e1b 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -52,11 +52,6 @@ class VPRecipeBuilder {
   /// Range. The function should not be called for memory instructions or calls.
   bool shouldWiden(Instruction *I, VFRange &Range) const;
 
-  /// Check if the load or store instruction \p VPI should widened for \p
-  /// Range.Start and potentially masked. Such instructions are handled by a
-  /// recipe that takes an additional VPInstruction for the mask.
-  VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
-
   /// Optimize the special case where the operand of \p VPI is a constant
   /// integer induction variable.
   VPWidenIntOrFpInductionRecipe *
@@ -72,24 +67,31 @@ class VPRecipeBuilder {
   /// cost-model indicates that widening should be performed.
   VPWidenRecipe *tryToWiden(VPInstruction *VPI);
 
-  /// Makes Histogram count operations safe for vectorization, by emitting a
-  /// llvm.experimental.vector.histogram.add intrinsic in place of the
-  /// Load + Add|Sub + Store operations that perform the histogram in the
-  /// original scalar loop.
-  VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
-                                         VPInstruction *VPI);
-
 public:
   VPRecipeBuilder(VPlan &Plan, const TargetLibraryInfo *TLI,
                   LoopVectorizationLegality *Legal,
                   LoopVectorizationCostModel &CM, VPBuilder &Builder)
       : Plan(Plan), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}
 
+  VPBuilder &getVPBuilder() const { return Builder; }
+
   /// Create and return a widened recipe for a non-phi recipe \p R if one can be
   /// created within the given VF \p Range.
   VPRecipeBase *tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
                                              VFRange &Range);
 
+  /// Check if the load or store instruction \p VPI should be widened for \p
+  /// Range.Start and potentially masked. Such instructions are handled by a
+  /// recipe that takes an additional VPInstruction for the mask.
+  VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
+
+  /// Makes Histogram count operations safe for vectorization, by emitting a
+  /// llvm.experimental.vector.histogram.add intrinsic in place of the
+  /// Load + Add|Sub + Store operations that perform the histogram in the
+  /// original scalar loop.
+  VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
+                                         VPInstruction *VPI);
+
   /// Set the recipe created for given ingredient.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
     assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 16f7ae2daeb5e..38df4468afa2b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -479,6 +479,10 @@ struct VPlanTransforms {
   /// are only valid for a subset of VFs in Range, Range.End is updated.
   static void createPartialReductions(VPlan &Plan, VPCostContext &CostCtx,
                                       VFRange &Range);
+
+  static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
+                                         VPRecipeBuilder &RecipeBuilder,
+                                         VPCostContext &CostCtx);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index d84a6e27e5473..92d9a6e42fd28 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -179,8 +179,8 @@ for.end:
 ; Cost of store:
 ;   store(4) / 2 = 2
 ;
-; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
 ; CHECK: Cost of 2 for VF 2: profitable to scalarize   store i32 %tmp2, ptr %tmp0, align 4
 ; CHECK: Cost of 3 for VF 2: profitable to scalarize   %tmp2 = add nsw i32 %tmp1, %x
 ;
@@ -229,10 +229,11 @@ for.end:
 ;   store(4) / 2 = 2
 ;
 ; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK:     Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
 ; CHECK:     Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
 ; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
 ; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
-; CHECK:     Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
 ; CHECK: Cost of 7 for VF 2: profitable to scalarize   %tmp3 = sdiv i32 %tmp1, %tmp2
 ; CHECK: Cost of 7 for VF 2: profitable to scalarize   %tmp4 = udiv i32 %tmp3, %tmp2
 ; CHECK: Cost of 2 for VF 2: profitable to scalarize   store i32 %tmp5, ptr %tmp0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
index bc9367942ac27..8617788c90584 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
@@ -5,6 +5,7 @@
 
 ; CHECK: VPlan for loop in 'foo' after printAfterInitialConstruction
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::introduceMasksAndLinearize
+; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeMemOpWideningDecisions
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::clearReductionWrapFlags
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::optimizeFindIVReductions
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::handleMultiUseReductions

>From 10319e792dcb7c5e0d09fcb709b12bbf7817de2a Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Mon, 23 Feb 2026 09:04:23 -0800
Subject: [PATCH 2/6] Don't make unnecessary captures

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3ad09570f2106..47e2f6b845fd2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9987,7 +9987,7 @@ void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
                                                  VPCostContext &CostCtx) {
   // Filter out scalar VPlan.
   if (LoopVectorizationPlanner::getDecisionAndClampRange(
-          [&](ElementCount VF) { return VF.isScalar(); }, Range))
+          [](ElementCount VF) { return VF.isScalar(); }, Range))
     return;
 
   // Scan the body of the loop in a topological order to visit each basic block

>From 7f998a9b2c80c90ca100261ddd352a1a96c426b6 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Mon, 23 Feb 2026 13:58:01 -0800
Subject: [PATCH 3/6] Move to VPlanTransforms, have to pass Legal explicitly

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 79 +------------------
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 76 ++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |  4 +-
 3 files changed, 80 insertions(+), 79 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 47e2f6b845fd2..14f533e491b0c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8253,7 +8253,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                         OrigLoop);
 
   RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
-                           Range, RecipeBuilder, CostCtx);
+                           Range, RecipeBuilder, CostCtx, *CM.Legal);
 
   // Now process all other blocks and instructions.
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
@@ -9981,80 +9981,3 @@ void LoopVectorizePass::printPipeline(
   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
   OS << '>';
 }
-
-void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
-                                                 VPRecipeBuilder &RecipeBuilder,
-                                                 VPCostContext &CostCtx) {
-  // Filter out scalar VPlan.
-  if (LoopVectorizationPlanner::getDecisionAndClampRange(
-          [](ElementCount VF) { return VF.isScalar(); }, Range))
-    return;
-
-  // Scan the body of the loop in a topological order to visit each basic block
-  // after having visited its predecessor basic blocks.
-  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
-  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
-  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
-      HeaderVPBB);
-
-  // Collect all loads/stores first. We will start with ones having simpler
-  // decisions followed by more complex ones that are potentially
-  // guided/dependent on the simpler ones.
-  SmallVector<VPInstruction *> MemOps;
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
-    for (VPRecipeBase &R : *VPBB) {
-      auto *VPI = dyn_cast<VPInstruction>(&R);
-      if (VPI && VPI->getUnderlyingValue() &&
-          is_contained({Instruction::Load, Instruction::Store},
-                       VPI->getOpcode()))
-        MemOps.push_back(VPI);
-    }
-  }
-
-  auto *Legal = CostCtx.CM.Legal;
-
-  auto *MiddleVPBB = Plan.getMiddleBlock();
-  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
-
-  for (VPInstruction *VPI : MemOps) {
-    Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
-    RecipeBuilder.getVPBuilder().setInsertPoint(VPI);
-
-    auto ReplaceWith = [&](VPRecipeBase *New) {
-      RecipeBuilder.setRecipe(Instr, New);
-      RecipeBuilder.getVPBuilder().insert(New);
-      if (VPI->getOpcode() == Instruction::Load)
-        VPI->replaceAllUsesWith(New->getVPSingleValue());
-      VPI->eraseFromParent();
-    };
-
-    // The stores with invariant address inside the loop will be deleted, and
-    // in the exit block, a uniform store recipe will be created for the final
-    // invariant store of the reduction.
-    StoreInst *SI;
-    if ((SI = dyn_cast<StoreInst>(Instr)) &&
-        Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
-      // Only create recipe for the final invariant store of the reduction.
-      if (Legal->isInvariantStoreOfReduction(SI)) {
-        auto *Recipe = new VPReplicateRecipe(
-            SI, VPI->operandsWithoutMask(), true /* IsUniform */,
-            nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
-        Recipe->insertBefore(*MiddleVPBB, MBIP);
-      }
-      VPI->eraseFromParent();
-      continue;
-    }
-
-    if (VPI->getOpcode() == Instruction::Store)
-      if (auto HistInfo = Legal->getHistogramInfo(cast<StoreInst>(Instr))) {
-        ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
-        continue;
-      }
-
-    VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
-    if (!Recipe)
-      Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
-
-    ReplaceWith(Recipe);
-  }
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b8329bb396ff6..3c053089ff0af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 
 using namespace llvm;
 using namespace VPlanPatternMatch;
@@ -6274,3 +6275,78 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan,
     for (const VPPartialReductionChain &Chain : Chains)
       transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
 }
+
+void VPlanTransforms::makeMemOpWideningDecisions(
+    VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder,
+    VPCostContext &CostCtx, LoopVectorizationLegality &Legal) {
+  // Filter out scalar VPlan.
+  if (LoopVectorizationPlanner::getDecisionAndClampRange(
+          [](ElementCount VF) { return VF.isScalar(); }, Range))
+    return;
+
+  // Scan the body of the loop in a topological order to visit each basic block
+  // after having visited its predecessor basic blocks.
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      HeaderVPBB);
+
+  // Collect all loads/stores first. We will start with ones having simpler
+  // decisions followed by more complex ones that are potentially
+  // guided/dependent on the simpler ones.
+  SmallVector<VPInstruction *> MemOps;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : *VPBB) {
+      auto *VPI = dyn_cast<VPInstruction>(&R);
+      if (VPI && VPI->getUnderlyingValue() &&
+          is_contained({Instruction::Load, Instruction::Store},
+                       VPI->getOpcode()))
+        MemOps.push_back(VPI);
+    }
+  }
+
+  auto *MiddleVPBB = Plan.getMiddleBlock();
+  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+
+  for (VPInstruction *VPI : MemOps) {
+    Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
+    RecipeBuilder.getVPBuilder().setInsertPoint(VPI);
+
+    auto ReplaceWith = [&](VPRecipeBase *New) {
+      RecipeBuilder.setRecipe(Instr, New);
+      RecipeBuilder.getVPBuilder().insert(New);
+      if (VPI->getOpcode() == Instruction::Load)
+        VPI->replaceAllUsesWith(New->getVPSingleValue());
+      VPI->eraseFromParent();
+    };
+
+    // The stores with invariant address inside the loop will be deleted, and
+    // in the exit block, a uniform store recipe will be created for the final
+    // invariant store of the reduction.
+    StoreInst *SI;
+    if ((SI = dyn_cast<StoreInst>(Instr)) &&
+        Legal.isInvariantAddressOfReduction(SI->getPointerOperand())) {
+      // Only create recipe for the final invariant store of the reduction.
+      if (Legal.isInvariantStoreOfReduction(SI)) {
+        auto *Recipe = new VPReplicateRecipe(
+            SI, VPI->operandsWithoutMask(), true /* IsUniform */,
+            nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
+        Recipe->insertBefore(*MiddleVPBB, MBIP);
+      }
+      VPI->eraseFromParent();
+      continue;
+    }
+
+    if (VPI->getOpcode() == Instruction::Store)
+      if (auto HistInfo = Legal.getHistogramInfo(cast<StoreInst>(Instr))) {
+        ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
+        continue;
+      }
+
+    VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
+    if (!Recipe)
+      Recipe = RecipeBuilder.handleReplication(cast<VPInstruction>(VPI), Range);
+
+    ReplaceWith(Recipe);
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 38df4468afa2b..55973b0bd1ccd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -26,6 +26,7 @@ namespace llvm {
 class InductionDescriptor;
 class Instruction;
 class Loop;
+class LoopVectorizationLegality;
 class LoopVersioning;
 class OptimizationRemarkEmitter;
 class PHINode;
@@ -482,7 +483,8 @@ struct VPlanTransforms {
 
   static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
                                          VPRecipeBuilder &RecipeBuilder,
-                                         VPCostContext &CostCtx);
+                                         VPCostContext &CostCtx,
+                                         LoopVectorizationLegality &Legal);
 };
 
 } // namespace llvm

>From d1080c47e31ce7abf33808f2b709ba7ffd37061e Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Mon, 23 Feb 2026 14:00:58 -0800
Subject: [PATCH 4/6] Braces for outer `if`

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3c053089ff0af..a932c4f7ccf11 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -6337,11 +6337,12 @@ void VPlanTransforms::makeMemOpWideningDecisions(
       continue;
     }
 
-    if (VPI->getOpcode() == Instruction::Store)
+    if (VPI->getOpcode() == Instruction::Store) {
       if (auto HistInfo = Legal.getHistogramInfo(cast<StoreInst>(Instr))) {
         ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
         continue;
       }
+    }
 
     VPRecipeBase *Recipe = RecipeBuilder.tryToWidenMemory(VPI, Range);
     if (!Recipe)

>From 6f6d71083c9e76a5770278169fa70283931b3183 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Tue, 3 Mar 2026 13:42:26 -0800
Subject: [PATCH 5/6] Fold one `Legal` use into `tryToWidenHistogram` renamed
 to `widenIfHistogram`

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp   |  9 +++++++--
 llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h   | 12 ++++++------
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp |  4 ++--
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 14f533e491b0c..2e1974f9543cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7960,8 +7960,13 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
   };
 }
 
-VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
-                                                        VPInstruction *VPI) {
+VPHistogramRecipe *VPRecipeBuilder::widenIfHistogram(VPInstruction *VPI) {
+  auto HistInfo =
+      Legal->getHistogramInfo(cast<StoreInst>(VPI->getUnderlyingInstr()));
+  if (!HistInfo)
+    return nullptr;
+
+  const HistogramInfo *HI = *HistInfo;
   // FIXME: Support other operations.
   unsigned Opcode = HI->Update->getOpcode();
   assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 0c261373e4e1b..080151e7cd2cf 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -85,12 +85,12 @@ class VPRecipeBuilder {
   /// recipe that takes an additional VPInstruction for the mask.
   VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
 
-  /// Makes Histogram count operations safe for vectorization, by emitting a
-  /// llvm.experimental.vector.histogram.add intrinsic in place of the
-  /// Load + Add|Sub + Store operations that perform the histogram in the
-  /// original scalar loop.
-  VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
-                                         VPInstruction *VPI);
+  /// If \p VPI represents a histogram operation (as determined by
+  /// LoopVectorizationLegality) make that safe for vectorization, by emitting a
+  /// llvm.experimental.vector.histogram.add intrinsic in place of the Load +
+  /// Add|Sub + Store operations that perform the histogram in the original
+  /// scalar loop.
+  VPHistogramRecipe *widenIfHistogram(VPInstruction *VPI);
 
   /// Set the recipe created for given ingredient.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a932c4f7ccf11..93f80157aaf31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -6338,8 +6338,8 @@ void VPlanTransforms::makeMemOpWideningDecisions(
     }
 
     if (VPI->getOpcode() == Instruction::Store) {
-      if (auto HistInfo = Legal.getHistogramInfo(cast<StoreInst>(Instr))) {
-        ReplaceWith(RecipeBuilder.tryToWidenHistogram(*HistInfo, VPI));
+      if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
+        ReplaceWith(Histogram);
         continue;
       }
     }

>From e95f9bb052aae2b4d6a693c84bb10d4077fe5da1 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Tue, 3 Mar 2026 15:37:00 -0800
Subject: [PATCH 6/6] Move another `Legal` use to
 `VPRecipeBuilder::replaceWithFinalIfReductionStore`

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 22 ++++++++++++-
 .../Transforms/Vectorize/VPRecipeBuilder.h    |  8 +++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 31 ++++++-------------
 .../Transforms/Vectorize/VPlanTransforms.h    |  4 +--
 4 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2e1974f9543cd..e90f3975786d4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7986,6 +7986,26 @@ VPHistogramRecipe *VPRecipeBuilder::widenIfHistogram(VPInstruction *VPI) {
   return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
 }
 
+bool VPRecipeBuilder::replaceWithFinalIfReductionStore(
+    VPBuilder &FinalRedStoresBuilder, VPInstruction *VPI) {
+  StoreInst *SI;
+  if ((SI = dyn_cast<StoreInst>(VPI->getUnderlyingInstr())) &&
+      Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
+    // Only create recipe for the final invariant store of the reduction.
+    if (Legal->isInvariantStoreOfReduction(SI)) {
+      auto *Recipe = new VPReplicateRecipe(
+          SI, VPI->operandsWithoutMask(), true /* IsUniform */,
+          nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
+      FinalRedStoresBuilder.insert(Recipe);
+      // Recipe->insertBefore(*MiddleVPBB, MBIP);
+    }
+    VPI->eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
 VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
                                                       VFRange &Range) {
   auto *I = VPI->getUnderlyingInstr();
@@ -8258,7 +8278,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                         OrigLoop);
 
   RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
-                           Range, RecipeBuilder, CostCtx, *CM.Legal);
+                           Range, RecipeBuilder, CostCtx);
 
   // Now process all other blocks and instructions.
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 080151e7cd2cf..a908c25de3fd5 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,6 +92,14 @@ class VPRecipeBuilder {
   /// scalar loop.
   VPHistogramRecipe *widenIfHistogram(VPInstruction *VPI);
 
+  /// The stores with invariant address inside the loop will be deleted, and in
+  /// the exit block, a uniform store recipe will be created for the final
+  /// invariant store of the reduction. Returns `true` if replacement took
+  /// place. The order of stores must be preserved, hence \p
+  /// FinalRedStoresBuilder.
+  bool replaceWithFinalIfReductionStore(VPBuilder &FinalRedStoresBuilder,
+                                        VPInstruction *VPI);
+
   /// Set the recipe created for given ingredient.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
     assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 93f80157aaf31..137da0fa4267d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -42,7 +42,6 @@
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 
 using namespace llvm;
 using namespace VPlanPatternMatch;
@@ -6276,16 +6275,19 @@ void VPlanTransforms::createPartialReductions(VPlan &Plan,
       transformToPartialReduction(Chain, CostCtx.Types, Plan, Phi);
 }
 
-void VPlanTransforms::makeMemOpWideningDecisions(
-    VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder,
-    VPCostContext &CostCtx, LoopVectorizationLegality &Legal) {
+void VPlanTransforms::makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
+                                                 VPRecipeBuilder &RecipeBuilder,
+                                                 VPCostContext &CostCtx) {
   // Filter out scalar VPlan.
   if (LoopVectorizationPlanner::getDecisionAndClampRange(
           [](ElementCount VF) { return VF.isScalar(); }, Range))
     return;
 
   // Scan the body of the loop in a topological order to visit each basic block
-  // after having visited its predecessor basic blocks.
+  // after having visited its predecessor basic blocks. This is necessary
+  // because we need to preserve the order of the reduction stores to an
+  // invariant address when transforming those to a scalar store outside the
+  // vector loop body.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
@@ -6306,7 +6308,7 @@ void VPlanTransforms::makeMemOpWideningDecisions(
   }
 
   auto *MiddleVPBB = Plan.getMiddleBlock();
-  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+  VPBuilder FinalRedStoresBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
 
   for (VPInstruction *VPI : MemOps) {
     Instruction *Instr = cast<Instruction>(VPI->getUnderlyingValue());
@@ -6320,22 +6322,9 @@ void VPlanTransforms::makeMemOpWideningDecisions(
       VPI->eraseFromParent();
     };
 
-    // The stores with invariant address inside the loop will be deleted, and
-    // in the exit block, a uniform store recipe will be created for the final
-    // invariant store of the reduction.
-    StoreInst *SI;
-    if ((SI = dyn_cast<StoreInst>(Instr)) &&
-        Legal.isInvariantAddressOfReduction(SI->getPointerOperand())) {
-      // Only create recipe for the final invariant store of the reduction.
-      if (Legal.isInvariantStoreOfReduction(SI)) {
-        auto *Recipe = new VPReplicateRecipe(
-            SI, VPI->operandsWithoutMask(), true /* IsUniform */,
-            nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
-        Recipe->insertBefore(*MiddleVPBB, MBIP);
-      }
-      VPI->eraseFromParent();
+    if (RecipeBuilder.replaceWithFinalIfReductionStore(FinalRedStoresBuilder,
+                                                       VPI))
       continue;
-    }
 
     if (VPI->getOpcode() == Instruction::Store) {
       if (VPHistogramRecipe *Histogram = RecipeBuilder.widenIfHistogram(VPI)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 55973b0bd1ccd..38df4468afa2b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -26,7 +26,6 @@ namespace llvm {
 class InductionDescriptor;
 class Instruction;
 class Loop;
-class LoopVectorizationLegality;
 class LoopVersioning;
 class OptimizationRemarkEmitter;
 class PHINode;
@@ -483,8 +482,7 @@ struct VPlanTransforms {
 
   static void makeMemOpWideningDecisions(VPlan &Plan, VFRange &Range,
                                          VPRecipeBuilder &RecipeBuilder,
-                                         VPCostContext &CostCtx,
-                                         LoopVectorizationLegality &Legal);
+                                         VPCostContext &CostCtx);
 };
 
 } // namespace llvm



More information about the llvm-commits mailing list