[llvm] b3dcf48 - [VPlan] Compute NumPredStores for VPReplicateRecipe costs in VPlan.

Fri Feb 13 13:17:10 PST 2026

Author: Florian Hahn
Date: 2026-02-13T21:16:53Z
New Revision: b3dcf485d23a574e4c377739a32468240a87c2fc

URL: https://github.com/llvm/llvm-project/commit/b3dcf485d23a574e4c377739a32468240a87c2fc
DIFF: https://github.com/llvm/llvm-project/commit/b3dcf485d23a574e4c377739a32468240a87c2fc.diff

LOG: [VPlan] Compute NumPredStores for VPReplicateRecipe costs in VPlan.

Compute the number of predicated stores directly in VPlan instead of
using CM.useEmulatedMaskMemRefHack(), which only accounts for the
number of predicated stores for the last VF the legacy cost model
considered.

Fixes https://github.com/llvm/llvm-project/issues/181183

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/lib/Transforms/Vectorize/VPlan.cpp
    llvm/lib/Transforms/Vectorize/VPlanHelpers.h
    llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 499c5a31421ed..b5feb3feea2e1 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -318,7 +318,7 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
         "Enable runtime interleaving until load/store ports are saturated"));
 
 /// The number of stores in a loop that are allowed to need predication.
-static cl::opt<unsigned> NumberOfStoresToPredicate(
+cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
     cl::desc("Max number of stores to be predicated behind an if."));
 
@@ -6890,10 +6890,6 @@ unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
   return CM.getPredBlockCostDivisor(CostKind, BB);
 }
 
-bool VPCostContext::useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF) {
-  return CM.useEmulatedMaskMemRefHack(I, VF);
-}
-
 InstructionCost
 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                           VPCostContext &CostCtx) const {

diff  --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 9b22c399d7acf..c266842f107e8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -67,6 +67,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
 
 extern cl::opt<unsigned> ForceTargetInstructionCost;
 
+extern cl::opt<unsigned> NumberOfStoresToPredicate;
+
 static cl::opt<bool> PrintVPlansInDotFormat(
     "vplan-print-in-dot-format", cl::Hidden,
     cl::desc("Use dot format instead of plain text when dumping VPlans"));
@@ -1803,3 +1805,31 @@ InstructionCost VPCostContext::getScalarizationOverhead(
   return ScalarizationCost +
          TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC);
 }
+
+bool VPCostContext::useEmulatedMaskMemRefHack(const VPReplicateRecipe *R,
+                                              ElementCount VF) {
+  const Instruction *UI = R->getUnderlyingInstr();
+  if (isa<LoadInst>(UI))
+    return true;
+  assert(isa<StoreInst>(UI) && "R must either be a load or store");
+
+  if (!NumPredStores) {
+    // Count the number of predicated stores in the VPlan, caching the result.
+    const VPlan &Plan = *R->getParent()->getPlan();
+    NumPredStores = 0;
+    for (const VPRegionBlock *VPRB :
+         VPBlockUtils::blocksOnly<const VPRegionBlock>(
+             vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+      assert(VPRB->isReplicator() && "must only contain replicate regions");
+      for (const VPBasicBlock *VPBB :
+           VPBlockUtils::blocksOnly<const VPBasicBlock>(
+               vp_depth_first_shallow(VPRB->getEntry()))) {
+        *NumPredStores += count_if(*VPBB, [](const VPRecipeBase &R) {
+          auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+          return RepR && isa<StoreInst>(RepR->getUnderlyingInstr());
+        });
+      }
+    }
+  }
+  return *NumPredStores > NumberOfStoresToPredicate;
+}

diff  --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index bab7e25cbf407..113ca8c4d0f7c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -346,6 +346,9 @@ struct VPCostContext {
   PredicatedScalarEvolution &PSE;
   const Loop *L;
 
+  /// Number of predicated stores in the VPlan, computed on demand.
+  std::optional<unsigned> NumPredStores;
+
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
                 TargetTransformInfo::TargetCostKind CostKind,
@@ -385,9 +388,8 @@ struct VPCostContext {
       bool AlwaysIncludeReplicatingR = false);
 
   /// Returns true if an artificially high cost for emulated masked memrefs
-  /// should be used. Forwards to
-  /// LoopVectorizationCostModel::useEmulatedMaskMemRefHack.
-  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
+  /// should be used.
+  bool useEmulatedMaskMemRefHack(const VPReplicateRecipe *R, ElementCount VF);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without

diff  --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 389331fd958d2..c15b9fb23060a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3570,7 +3570,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
           VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
           /*Insert=*/false, /*Extract=*/true, Ctx.CostKind);
 
-      if (Ctx.useEmulatedMaskMemRefHack(UI, VF)) {
+      if (Ctx.useEmulatedMaskMemRefHack(this, VF)) {
         // Artificially setting to a high enough value to practically disable
         // vectorization with such operations.
         return 3000000;

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 1ef9b60f63bef..86ea54a92cb1b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1406,11 +1406,80 @@ exit:
   ret void
 }
 
+define void @predicated_store(ptr %A, ptr noalias %B, ptr noalias %C, ptr %D, ptr %E, double %divisor, i64 %loop.count) #3 {
+; COMMON-LABEL: define void @predicated_store(
+; COMMON-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], double [[DIVISOR:%.*]], i64 [[LOOP_COUNT:%.*]]) #[[ATTR4:[0-9]+]] {
+; COMMON-NEXT:  [[ENTRY:.*]]:
+; COMMON-NEXT:    br label %[[LOOP_HEADER:.*]]
+; COMMON:       [[LOOP_HEADER]]:
+; COMMON-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; COMMON-NEXT:    [[GEP_B:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]]
+; COMMON-NEXT:    [[FIRST_VAL:%.*]] = load double, ptr [[B]], align 8
+; COMMON-NEXT:    store double 0.000000e+00, ptr [[D]], align 8
+; COMMON-NEXT:    [[COEFF:%.*]] = load double, ptr [[C]], align 8
+; COMMON-NEXT:    [[NEG:%.*]] = fneg double [[FIRST_VAL]]
+; COMMON-NEXT:    [[NORM:%.*]] = fdiv double [[NEG]], [[DIVISOR]]
+; COMMON-NEXT:    [[SCALED_COEFF:%.*]] = fmul double [[COEFF]], [[NORM]]
+; COMMON-NEXT:    [[INPUT_VAL:%.*]] = load double, ptr [[GEP_B]], align 8
+; COMMON-NEXT:    [[SCALED_INPUT:%.*]] = fmul double [[INPUT_VAL]], [[DIVISOR]]
+; COMMON-NEXT:    [[PRODUCT:%.*]] = fmul double [[SCALED_COEFF]], [[SCALED_INPUT]]
+; COMMON-NEXT:    [[FINAL:%.*]] = fmul double [[PRODUCT]], [[DIVISOR]]
+; COMMON-NEXT:    [[IS_POS:%.*]] = fcmp ogt double [[FINAL]], 0.000000e+00
+; COMMON-NEXT:    br i1 [[IS_POS]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; COMMON:       [[THEN]]:
+; COMMON-NEXT:    store double 0.000000e+00, ptr [[A]], align 8
+; COMMON-NEXT:    [[GEP_E:%.*]] = getelementptr i8, ptr [[E]], i64 [[IV]]
+; COMMON-NEXT:    store double 0.000000e+00, ptr [[GEP_E]], align 8
+; COMMON-NEXT:    br label %[[LOOP_LATCH]]
+; COMMON:       [[LOOP_LATCH]]:
+; COMMON-NEXT:    store double 0.000000e+00, ptr null, align 8
+; COMMON-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; COMMON-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[LOOP_COUNT]]
+; COMMON-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; COMMON:       [[EXIT]]:
+; COMMON-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.B = getelementptr double, ptr %B, i64 %iv
+  %first.val = load double, ptr %B, align 8
+  store double 0.000000e+00, ptr %D, align 8
+  %coeff = load double, ptr %C, align 8
+  %neg = fneg double %first.val
+  %norm = fdiv double %neg, %divisor
+  %scaled.coeff = fmul double %coeff, %norm
+  %input.val = load double, ptr %gep.B, align 8
+  %scaled.input = fmul double %input.val, %divisor
+  %product = fmul double %scaled.coeff, %scaled.input
+  %final = fmul double %product, %divisor
+  %is.pos = fcmp ogt double %final, 0.000000e+00
+  br i1 %is.pos, label %then, label %loop.latch
+
+then:
+  store double 0.000000e+00, ptr %A, align 8
+  %gep.E = getelementptr i8, ptr %E, i64 %iv
+  store double 0.000000e+00, ptr %gep.E, align 8
+  br label %loop.latch
+
+loop.latch:
+  store double 0.000000e+00, ptr null, align 8
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %loop.count
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
 declare float @llvm.fmuladd.f32(float, float, float) #1
 declare double @llvm.pow.f64(double, double)
 
 attributes #1 = { "target-cpu"="neoverse-512tvb" }
 attributes #2 = { vscale_range(2,2) "target-cpu"="neoverse-512tvb" }
+attributes #3 = { "target-cpu"="neoverse-v2" }
 
 !0 = distinct !{!0, !1, !2, !3}
 !1 = !{!"llvm.loop.vectorize.width", i32 8}