[llvm] [VPlan] Compute cost for predicated loads/stores to invariant address. (PR #181572)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 15 12:42:32 PST 2026


https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/181572

Update VPReplicateRecipe::computeCost to compute the cost of predicated loads/stores to loop-invariant addresses when they are masked only by the header mask.

This matches the legacy cost model logic, but it is slightly odd that the legacy cost model seems to do this only for stores predicated by the header mask (i.e. under tail-folding, and not for stores that are otherwise executed conditionally). This is probably something we want to re-evaluate eventually.

From 7a4aaae1f45d99795aa91e4970384053d4c405e3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 15 Feb 2026 20:41:52 +0000
Subject: [PATCH] [VPlan] Compute cost for predicated loads/stores to invariant
 address.

Update VPReplicateRecipe::computeCost to compute the cost for stores to
invariant addresses only masked by the header mask.

This matches the legacy cost model logic, but it is slightly odd that
the legacy cost model only seems to do this for stores predicated by the
header mask (i.e. tail-folding and not executed conditionally
otherwise). This is probably something we want to re-evaluate
eventually.
---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 51 +++++++++++++++++--
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bab1fc773fa4b..50e564a97fb6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3379,6 +3379,48 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
   return false;
 }
 
+/// Cost of predicated load/store \p RepR with a loop-invariant address that is
+/// masked only by the header mask; returns an invalid cost in any other case.
+static InstructionCost
+getPredicatedUniformLoadStoreCost(const VPReplicateRecipe *RepR,
+                                  const SCEV *PtrSCEV, ElementCount VF,
+                                  VPCostContext &Ctx) {
+  if (!Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
+    return InstructionCost::getInvalid();
+
+  const VPRegionBlock *ParentRegion = RepR->getParent()->getParent();
+  auto *BOM =
+      cast<VPBranchOnMaskRecipe>(&ParentRegion->getEntryBasicBlock()->front());
+  if (!vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan()))
+    return InstructionCost::getInvalid();
+
+  bool IsLoad = RepR->getOpcode() == Instruction::Load;
+  Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? RepR : RepR->getOperand(0));
+  const VPValue *PtrOp = RepR->getOperand(!IsLoad);
+  Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+  const Align Alignment = getLoadStoreAlignment(RepR->getUnderlyingInstr());
+  unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
+
+  // Scalar address computation + scalar memory op, as in getUniformMemOpCost.
+  InstructionCost UniformCost =
+      Ctx.TTI.getAddressComputationCost(ScalarPtrTy, nullptr, nullptr,
+                                        Ctx.CostKind) +
+      Ctx.TTI.getMemoryOpCost(RepR->getOpcode(), ValTy, Alignment, AS,
+                              Ctx.CostKind);
+  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  if (IsLoad) {
+    // Loads additionally pay for broadcasting the scalar to a vector.
+    UniformCost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
+                                          VectorTy, VectorTy, {}, Ctx.CostKind);
+  } else { // Store: add extract cost unless stored value is defined outside loops.
+    VPValue *StoredVal = RepR->getOperand(0);
+    if (!StoredVal->isDefinedOutsideLoopRegions())
+      UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
+          Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
+  }
+  return UniformCost;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3557,10 +3599,13 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
 
     const VPRegionBlock *ParentRegion = getRegion();
     if (ParentRegion && ParentRegion->isReplicator()) {
-      // TODO: Handle loop-invariant pointers in predicated blocks. For now,
-      // fall back to the legacy cost model.
-      if (!PtrSCEV || Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
+      if (!PtrSCEV)
         break;
+      InstructionCost UniformCost =
+          getPredicatedUniformLoadStoreCost(this, PtrSCEV, VF, Ctx);
+      if (UniformCost.isValid())
+        return UniformCost;
+
       Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
       Cost += Ctx.TTI.getCFInstrCost(Instruction::Br, Ctx.CostKind);
 



More information about the llvm-commits mailing list