[llvm] [VPlan] Remove legacy costing inside VPBlendRecipe::computeCost (PR #171846)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 07:18:04 PST 2025
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/171846
A VPBlendRecipe always emits selects, even when the VF is scalar.
However, the legacy cost model always costs all scalar non-header phis as a phi, and the VPlan cost model has to account for this.
This can cause the cost to be a little off: for example, the cost of the select in @smax_call_uniform is not included, which leads to unprofitable vectorization.
This patch removes that legacy special case from the VPlan cost model and instead checks for it in planContainsAdditionalSimplifications.
I considered trying to make the legacy cost model more accurate, but I'm not sure that is possible: we would need to know whether the scalar VF we are costing is the original loop, in which case it's actually a phi, or a VPBlendRecipe that emits a select, potentially from a VF=1, UF>=1 VPlan.
From 4a502ff36a88fd7ca76e3696226dc44c37759b30 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 11 Dec 2025 22:48:08 +0800
Subject: [PATCH] [VPlan] Remove legacy costing inside
VPBlendRecipe::computeCost
A VPBlendRecipe always emits selects, even when the VF is scalar.
However, the legacy cost model always costs all scalar non-header phis as a phi, and the VPlan cost model has to account for this.
This can cause the cost to be a little off: for example, the cost of the select in @smax_call_uniform is not included, which leads to unprofitable vectorization.
This patch removes that legacy special case from the VPlan cost model and instead checks for it in planContainsAdditionalSimplifications.
I considered trying to make the legacy cost model more accurate, but I'm not sure that is possible: we would need to know whether the scalar VF we are costing is the original loop, in which case it's actually a phi, or a VPBlendRecipe that emits a select, potentially from a VF=1, UF>=1 VPlan.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 5 ++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 --
.../X86/replicate-uniform-call.ll | 54 +++++--------------
3 files changed, 18 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b3577e4ecbca8..52b0360e23074 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7081,6 +7081,11 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
return true;
}
+ // The legacy cost model costs non-header phis with a scalar VF as a phi,
+ // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
+ if (VF.isScalar() && isa<VPBlendRecipe>(&R))
+ return true;
+
/// If a VPlan transform folded a recipe to one producing a single-scalar,
/// but the original instruction wasn't uniform-after-vectorization in the
/// legacy cost model, the legacy cost overestimates the actual cost.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3c0f02c0d7d1c..6c7bedaf2c933 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2551,11 +2551,6 @@ void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- // Handle cases where only the first lane is used the same way as the legacy
- // cost model.
- if (vputils::onlyFirstLaneUsed(this))
- return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
-
Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
return (getNumIncomingValues() - 1) *
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
index 70b05ac34559e..611a097024134 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll
@@ -8,48 +8,25 @@ target triple = "x86_64-unknown-linux-gnu"
define void @smax_call_uniform(ptr %dst, i64 %x) {
; CHECK-LABEL: define void @smax_call_uniform(
; CHECK-SAME: ptr [[DST:%.*]], i64 [[X:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[C:%.*]] = icmp ult i8 -68, -69
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[X]], 0
-; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE6:.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_UREM_IF:.*]], label %[[PRED_UREM_CONTINUE:.*]]
-; CHECK: [[PRED_UREM_IF]]:
-; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE]]
-; CHECK: [[PRED_UREM_CONTINUE]]:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_UREM_IF1:.*]], label %[[PRED_UREM_CONTINUE2:.*]]
-; CHECK: [[PRED_UREM_IF1]]:
-; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE2]]
-; CHECK: [[PRED_UREM_CONTINUE2]]:
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_UREM_IF3:.*]], label %[[PRED_UREM_CONTINUE4:.*]]
-; CHECK: [[PRED_UREM_IF3]]:
-; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE4]]
-; CHECK: [[PRED_UREM_CONTINUE4]]:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UREM_IF5:.*]], label %[[PRED_UREM_CONTINUE6]]
-; CHECK: [[PRED_UREM_IF5]]:
-; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]]
-; CHECK: [[PRED_UREM_CONTINUE6]]:
-; CHECK-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0)
-; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[C]], i64 1, i64 [[TMP13]]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: [[REM:%.*]] = urem i64 [[MUL]], [[X]]
+; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[REM]], i64 0)
+; CHECK-NEXT: br label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[PREDPHI7:%.*]] = phi i64 [ 1, %[[LOOP_HEADER]] ], [ [[SMAX]], %[[ELSE]] ]
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[PREDPHI7]], 1
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]]
; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8
-; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK-NEXT: br i1 [[TMP20]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -81,8 +58,3 @@ exit:
}
declare i64 @llvm.smax.i64(i64, i64)
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-;.
More information about the llvm-commits
mailing list