[llvm] [LV] Always include middle block cost in isOutsideLoopWorkProfitable. (PR #171102)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 8 07:04:08 PST 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/171102
>From c63302e586923a90f208f9d10f71be6c1ec05d21 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 8 Dec 2025 14:26:53 +0000
Subject: [PATCH 1/2] [VPlan] Remove ExtractLastLane for plans with scalar VFs.
ExtractLastLane is a no-op for scalar VFs. Update simplifyRecipe to
remove it in that case. This also requires adjusting the code in VPlanUnroll.cpp
to split off handling of ExtractLastLane/ExtractPenultimateElement for
scalar VFs, which now needs to match ExtractLastPart.
---
.../lib/Transforms/Vectorize/VPlanTransforms.cpp | 16 ++++++++++------
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 12 +++++++-----
.../interleave-and-scalarize-only.ll | 3 +--
3 files changed, 18 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 320baeb454d46..4ad098d748568 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1385,12 +1385,16 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
return;
}
- // Look through ExtractLastLane (BuildVector ....).
- if (match(Def, m_ExtractLastLane(m_BuildVector()))) {
- auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
- Def->replaceAllUsesWith(
- BuildVector->getOperand(BuildVector->getNumOperands() - 1));
- return;
+ // Look through ExtractLastLane.
+ if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
+ if (match(A, m_BuildVector())) {
+ auto *BuildVector = cast<VPInstruction>(A);
+ Def->replaceAllUsesWith(
+ BuildVector->getOperand(BuildVector->getNumOperands() - 1));
+ return;
+ }
+ if (Plan->hasScalarVFOnly())
+ return Def->replaceAllUsesWith(A);
}
// Look through ExtractPenultimateElement (BuildVector ....).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 6fb706ea7d64b..7b4c524712d9a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -371,10 +371,9 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
continue;
}
- if (match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op0))) ||
- match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
- addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
- if (Plan.hasScalarVFOnly()) {
+ if (Plan.hasScalarVFOnly()) {
+ if (match(&R, m_ExtractLastPart(m_VPValue(Op0))) ||
+ match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
auto *I = cast<VPInstruction>(&R);
bool IsPenultimatePart =
I->getOpcode() == VPInstruction::ExtractPenultimateElement;
@@ -383,7 +382,10 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
continue;
}
- // For vector VF, always extract from the last part.
+ }
+ if (match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op0))) ||
+ match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
+ addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
R.setOperand(0, getValueForPart(Op0, UF - 1));
continue;
}
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index bbd596a772c53..c77afa870e2c1 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -220,7 +220,6 @@ exit:
; DBG-EMPTY:
; DBG-NEXT: middle.block:
; DBG-NEXT: EMIT vp<[[RESUME_1_PART:%.+]]> = extract-last-part vp<[[SCALAR_STEPS]]>
-; DBG-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-last-lane vp<[[RESUME_1_PART]]>
; DBG-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
; DBG-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; DBG-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -230,7 +229,7 @@ exit:
; DBG-EMPTY:
; DBG-NEXT: scalar.ph:
; DBG-NEXT: EMIT-SCALAR vp<[[RESUME_IV:%.+]]> = phi [ vp<[[VTC]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; DBG-NEXT: EMIT-SCALAR vp<[[RESUME_P:%.*]]> = phi [ vp<[[RESUME_1]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; DBG-NEXT: EMIT-SCALAR vp<[[RESUME_P:%.*]]> = phi [ vp<[[RESUME_1_PART]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
; DBG-NEXT: Successor(s): ir-bb<loop>
; DBG-EMPTY:
; DBG-NEXT: ir-bb<loop>:
>From 3594837b37a66d5f001ccab46621875bde207c96 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 7 Dec 2025 22:24:00 +0000
Subject: [PATCH 2/2] [LV] Always include middle block cost in
isOutsideLoopWorkProfitable.
Always include the cost of the middle block in
isOutsideLoopWorkProfitable. This addresses the TODO from
https://github.com/llvm/llvm-project/pull/168949 and removes the
temporary restriction.
isOutsideLoopWorkProfitable already scales the cost outside loops
according to the expected trip counts.
In practice this increases the minimum iteration threshold in a few
cases. On a large IR corpus based on C/C++ workloads, ~50 out of 179450
vector loops have their thresholds increased slightly.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 +-------
.../LoopVectorize/AArch64/early_exit_costs.ll | 2 +-
.../LoopVectorize/AArch64/induction-costs.ll | 2 +-
.../AArch64/low_trip_memcheck_cost.ll | 14 +++++++-------
.../AArch64/scalable-avoid-scalarization.ll | 3 ++-
5 files changed, 12 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4edc004f161a1..c07663ad9670c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9329,13 +9329,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
// one exists.
TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
- // If the expected trip count is less than the VF, the vector loop will only
- // execute a single iteration. Then the middle block is executed the same
- // number of times as the vector region.
- // TODO: Extend logic to always account for the cost of the middle block.
- auto ExpectedTC = getSmallBestKnownTC(PSE, L);
- if (ExpectedTC && ElementCount::isKnownLE(*ExpectedTC, VF.Width))
- TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
+ TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
// When interleaving only scalar and vector cost will be equal, which in turn
// would lead to a divide by 0. Fall back to hard threshold.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
index 7ae50a5e4a075..de5870e269b67 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
@@ -96,7 +96,7 @@ define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) %
; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit:
; CHECK-NEXT: Cost of 1 for VF 1: EMIT vp<%first.active.lane> = first-active-lane ir<%t>
; CHECK-NEXT: Cost of 0 for VF 1: EMIT vp<%early.exit.value> = extract-lane vp<%first.active.lane>, ir<%l>
-; CHECK-NEXT: LV: Vectorization is possible but not beneficial.
+; CHECK: LV: Vectorization is possible but not beneficial.
entry:
br label %loop.header
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 7b42e565e127d..40db6a53b49e4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -94,7 +94,7 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 611b980999bfe..df1c639911cb0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -8,7 +8,7 @@ define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef reado
; CHECK: Calculating cost of runtime checks:
; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop.
; CHECK: Total cost of runtime checks: 4
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %inner.loop
@@ -34,7 +34,7 @@ define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonl
; CHECK: Calculating cost of runtime checks:
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
; CHECK: Total cost of runtime checks: 3
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
@@ -71,7 +71,7 @@ define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef rea
; CHECK: Calculating cost of runtime checks:
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
; CHECK: Total cost of runtime checks: 2
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
@@ -108,7 +108,7 @@ define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef re
; CHECK: Calculating cost of runtime checks:
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
; CHECK: Total cost of runtime checks: 1
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
@@ -145,7 +145,7 @@ define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonl
; CHECK: Calculating cost of runtime checks:
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
; CHECK: Total cost of runtime checks: 2
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
@@ -182,7 +182,7 @@ define void @outer_pgo_minus1(ptr nocapture noundef %a, ptr nocapture noundef re
; CHECK: Calculating cost of runtime checks:
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
; CHECK: Total cost of runtime checks: 1
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
@@ -219,7 +219,7 @@ define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr n
; CHECK: Calculating cost of runtime checks:
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
; CHECK: Total cost of runtime checks: 2
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
+; CHECK: LV: Minimum required TC for runtime checks to be profitable:4
entry:
br label %outer.loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index e338b828d2520..dd6f0fe5f1292 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -16,7 +16,8 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[IDX]]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP3]], i32 6)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[UMAX]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
More information about the llvm-commits
mailing list