[llvm] [LV] Don't skip VPlan cost model for div/rem instructions (PR #187056)

Fri Mar 20 06:54:04 PDT 2026

https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/187056

>From a80f58ad97cee45f52c9698c4e7cc0e936da67ec Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 17 Mar 2026 16:37:44 +0000
Subject: [PATCH 1/3] [LV] Don't skip VPlan cost model for div/rem instructions

In LoopVectorizationPlanner::precomputeCosts we currently skip
computing costs with the VPlan cost model for instructions that
are going to be scalarized, preferring to use the legacy costs
instead. This helps to prevent the assert that compares the legacy
and VPlan cost models from firing, but really we should be
encouraging full use of the VPlan cost model.

I've created this initial PR to stop skipping the VPlan cost
computation for udiv/urem/sdiv/srem instructions. The VPlan
costs seem to match up nicely with the legacy costs.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp   | 15 ++++++++++++++-
 .../LoopVectorize/AArch64/aarch64-predication.ll  |  2 +-
 .../LoopVectorize/AArch64/predication_costs.ll    |  8 ++++----
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ac9b790c739bf..59b039a75eec2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7009,8 +7009,21 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     });
     Cost += ForcedCost;
   }
+
+  auto UseVPlanCostModel = [](Instruction *I) -> bool {
+    switch (I->getOpcode()) {
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::SRem:
+    case Instruction::URem:
+      return true;
+    default:
+      return false;
+    }
+  };
   for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
-    if (CostCtx.skipCostComputation(Scalarized, VF.isVector()))
+    if (UseVPlanCostModel(Scalarized) ||
+        CostCtx.skipCostComputation(Scalarized, VF.isVector()))
       continue;
     CostCtx.SkipCostComputation.insert(Scalarized);
     LLVM_DEBUG({
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 1f3949172b758..983c1b9c2b902 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
 ; %var4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        Cost of 5 for VF 2: profitable to scalarize   %var4 = udiv i64 %var2, %var3
+; COST:        Cost of 5 for VF 2: REPLICATE ir<%var4> = udiv ir<%var2>, ir<%var3> (S->V)
 ;
 ;
 define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index d84a6e27e5473..944632a796bdb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -19,7 +19,7 @@ target triple = "aarch64--linux-gnu"
 ;   (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ;
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Cost of 7 for VF 2: profitable to scalarize   %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Cost of 7 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp2>, ir<%tmp3> (S->V)
 ;
 define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
 entry:
@@ -135,8 +135,8 @@ for.end:
 ;
 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Cost of 5 for VF 2: profitable to scalarize   %tmp4 = udiv i32 %tmp2, %tmp3
 ; CHECK: Cost of 3 for VF 2: profitable to scalarize   %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Cost of 5 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp2>, ir<%tmp3> (S->V)
 ;
 
 define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
@@ -233,11 +233,11 @@ for.end:
 ; CHECK:     Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
 ; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
-; CHECK: Cost of 7 for VF 2: profitable to scalarize   %tmp3 = sdiv i32 %tmp1, %tmp2
-; CHECK: Cost of 7 for VF 2: profitable to scalarize   %tmp4 = udiv i32 %tmp3, %tmp2
 ; CHECK: Cost of 2 for VF 2: profitable to scalarize   store i32 %tmp5, ptr %tmp0, align 4
 ; CHECK: Cost of 3 for VF 2: profitable to scalarize   %tmp5 = sub i32 %tmp4, %x
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%tmp2> = add ir<%tmp1>, ir<%x>
+; CHECK: Cost of 7 for VF 2: REPLICATE ir<%tmp3> = sdiv ir<%tmp1>, ir<%tmp2>
+; CHECK: Cost of 5 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp3>, ir<%tmp2>
 ;
 define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry:

>From c41ee17185f8eac5878f4f459747ca336f84d58d Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 18 Mar 2026 16:37:04 +0000
Subject: [PATCH 2/3] Address review comments

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp             | 6 ++++--
 .../Transforms/LoopVectorize/AArch64/aarch64-predication.ll | 4 +---
 .../Transforms/LoopVectorize/AArch64/predication_costs.ll   | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 59b039a75eec2..32d2aeded4176 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6996,8 +6996,10 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     return Cost;
 
   // Pre-compute costs for instructions that are forced-scalar or profitable to
-  // scalarize. Their costs will be computed separately in the legacy cost
-  // model.
+  // scalarize. For most such instructions, their scalarization costs are
+  // accounted for here using the legacy cost model. However, some opcodes
+  // are excluded from these precomputed scalarization costs and are instead
+  // modeled later by the VPlan cost model (see UseVPlanCostModel below).
   for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
     if (CostCtx.skipCostComputation(ForcedScalar, VF.isVector()))
       continue;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 983c1b9c2b902..ba8a4c735b94d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -8,9 +8,7 @@ target triple = "aarch64--linux-gnu"
 
 ; This test checks that we correctly compute the scalarized operands for a
 ; user-specified vectorization factor when interleaving is disabled. We use
-; -force-vector-interleave=1 to disable all interleaving calculations. A cost of
-; 4 for %var4 indicates that we would scalarize it's operand (%var3), giving
-; %var4 a lower scalarization overhead.
+; -force-vector-interleave=1 to disable all interleaving calculations.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
 ; COST:        Cost of 5 for VF 2: REPLICATE ir<%var4> = udiv ir<%var2>, ir<%var3> (S->V)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index 944632a796bdb..b9b91be9b7a65 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -222,7 +222,7 @@ for.end:
 ; Cost of sdiv:
 ;   (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
 ; Cost of udiv:
-;   (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
+;   (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5
 ; Cost of sub:
 ;   (sub(2) + extractelement(4)) / 2 = 3
 ; Cost of store:

>From 9e4a2e834550db3ee06d54a55fc465a2d1a3dc3c Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 20 Mar 2026 13:52:33 +0000
Subject: [PATCH 3/3] Improve div/rem cost model

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp           | 15 +++++++++++++--
 llvm/lib/Transforms/Vectorize/VPlanHelpers.h      |  1 +
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp    |  5 +++--
 .../LoopVectorize/AArch64/predication_costs.ll    |  8 ++++----
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index da631984a9a3c..6521def4da1c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1787,7 +1787,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
 
 InstructionCost VPCostContext::getScalarizationOverhead(
     Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
-    TTI::VectorInstrContext VIC, bool AlwaysIncludeReplicatingR) {
+    const VPSingleDefRecipe *R, TTI::VectorInstrContext VIC,
+    bool AlwaysIncludeReplicatingR) {
   if (VF.isScalar())
     return 0;
 
@@ -1796,7 +1797,17 @@ InstructionCost VPCostContext::getScalarizationOverhead(
 
   InstructionCost ScalarizationCost = 0;
   // Compute the cost of scalarizing the result if needed.
-  if (!ResultTy->isVoidTy()) {
+  bool ScalarizeResult = !ResultTy->isVoidTy();
+  if (ScalarizeResult && R) {
+    // Is this recipe only used by other recipes in the same block? If so, the
+    // result does not need scalarizing since its only uses will be scalar.
+    ScalarizeResult = llvm::any_of(R->users(), [R](VPUser *U) {
+      auto *UR = dyn_cast<VPRecipeBase>(U);
+      return UR && R->getParent() != UR->getParent();
+    });
+  }
+
+  if (ScalarizeResult) {
     for (Type *VectorTy :
          to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) {
       ScalarizationCost += TTI.getScalarizationOverhead(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 113ca8c4d0f7c..c7e5521705077 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -384,6 +384,7 @@ struct VPCostContext {
   /// replicating operands.
   InstructionCost getScalarizationOverhead(
       Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+      const VPSingleDefRecipe *R = nullptr,
       TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None,
       bool AlwaysIncludeReplicatingR = false);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 401f6725677e3..4f77e980f59b3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3518,7 +3518,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
 
     ScalarCost = ScalarCost * VF.getFixedValue() +
                  Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
-                                              to_vector(operands()), VF);
+                                              to_vector(operands()), VF, this);
     // If the recipe is not predicated (i.e. not in a replicate region), return
     // the scalar cost. Otherwise handle predicated cost.
     if (!getRegion()->isReplicator())
@@ -3583,7 +3583,8 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
         IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
     InstructionCost Cost =
         (ScalarCost * VF.getFixedValue()) +
-        Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, VIC, true);
+        Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, nullptr, VIC,
+                                     true);
 
     const VPRegionBlock *ParentRegion = getRegion();
     if (ParentRegion && ParentRegion->isReplicator()) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index b9b91be9b7a65..b1568599f8242 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -220,9 +220,9 @@ for.end:
 ; Cost of add:
 ;   add(1) = 1
 ; Cost of sdiv:
-;   (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
+;   (sdiv(2) + extractelement(8)) / 2 = 5
 ; Cost of udiv:
-;   (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5
+;   (udiv(2) + extractelement(4)) / 2 = 3
 ; Cost of sub:
 ;   (sub(2) + extractelement(4)) / 2 = 3
 ; Cost of store:
@@ -236,8 +236,8 @@ for.end:
 ; CHECK: Cost of 2 for VF 2: profitable to scalarize   store i32 %tmp5, ptr %tmp0, align 4
 ; CHECK: Cost of 3 for VF 2: profitable to scalarize   %tmp5 = sub i32 %tmp4, %x
 ; CHECK: Cost of 1 for VF 2: WIDEN ir<%tmp2> = add ir<%tmp1>, ir<%x>
-; CHECK: Cost of 7 for VF 2: REPLICATE ir<%tmp3> = sdiv ir<%tmp1>, ir<%tmp2>
-; CHECK: Cost of 5 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp3>, ir<%tmp2>
+; CHECK: Cost of 5 for VF 2: REPLICATE ir<%tmp3> = sdiv ir<%tmp1>, ir<%tmp2>
+; CHECK: Cost of 3 for VF 2: REPLICATE ir<%tmp4> = udiv ir<%tmp3>, ir<%tmp2>
 ;
 define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
 entry: