[llvm] [VPlan] Generalize licm-sink to strip zero-user bail (PR #187077)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 31 01:13:27 PDT 2026
https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/187077
>From 39b66ee1e192331753378921a4af62612d978412 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <artagnon at tenstorrent.com>
Date: Tue, 17 Mar 2026 16:44:27 +0000
Subject: [PATCH] [VPlan] Generalize licm-sink to strip zero-user bail
In preparation for sinking unpredicated stores, strip the zero-user
bail-out in the licm transform, generalizing it to sink to all successors
of the vector loop region.
---
.../Transforms/Vectorize/VPlanTransforms.cpp | 67 ++++++++++---------
.../LoopVectorize/AArch64/select-costs.ll | 8 +--
.../LoopVectorize/scalable-assume.ll | 8 +--
3 files changed, 43 insertions(+), 40 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f779aa92a3aa7..6336b3a76d252 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2679,9 +2679,8 @@ static void licm(VPlan &Plan) {
#ifndef NDEBUG
VPDominatorTree VPDT(Plan);
#endif
- // Sink recipes with no users inside the vector loop region if all users are
- // in the same exit block of the region.
- // TODO: Extend to sink recipes from inner loops.
+ // Sink recipes within the vector loop region to successors of the loop
+ // region.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_post_order_shallow(LoopRegion->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
@@ -2703,43 +2702,47 @@ static void licm(VPlan &Plan) {
// TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
// support recipes with multiple defined values (e.g., interleaved loads).
auto *Def = cast<VPSingleDefRecipe>(&R);
- // Skip recipes without users as we cannot determine a sink block.
- // TODO: Clone sinkable recipes without users to all exit blocks to reduce
- // their execution frequency.
- if (Def->getNumUsers() == 0)
- continue;
- VPBasicBlock *SinkBB = nullptr;
- // Cannot sink the recipe if any user
- // * is defined in any loop region, or
- // * is a phi, or
- // * multiple users in different blocks.
- if (any_of(Def->users(), [&SinkBB](VPUser *U) {
+ // Cannot sink the recipe if any user is a phi, is defined in a loop
+ // region, or is in a block whose single predecessor is not the vector
+ // loop region.
+ if (any_of(Def->users(), [&LoopRegion](VPUser *U) {
auto *UserR = cast<VPRecipeBase>(U);
VPBasicBlock *Parent = UserR->getParent();
// TODO: If the user is a PHI node, we should check the block of
// incoming value. Support PHI node users if needed.
- if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
- return true;
- // TODO: Support sinking when users are in multiple blocks.
- if (SinkBB && SinkBB != Parent)
- return true;
- SinkBB = Parent;
- return false;
+ return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
+ Parent->getSinglePredecessor() != LoopRegion;
}))
continue;
- // Only sink to dedicated exit blocks of the loop region.
- if (SinkBB->getSinglePredecessor() != LoopRegion)
- continue;
-
- // TODO: This will need to be a check instead of a assert after
- // conditional branches in vectorized loops are supported.
- assert(VPDT.properlyDominates(VPBB, SinkBB) &&
- "Defining block must dominate sink block");
- // TODO: Clone the recipe if users are on multiple exit paths, instead of
- // just moving.
- Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
+ // Compute the users' parent blocks if there are users. Otherwise, sink to
+ // all successor blocks.
+ SmallVector<VPBasicBlock *> SinkBlocks;
+ if (Def->users().empty())
+ append_range(SinkBlocks, map_range(LoopRegion->successors(),
+ [](VPBlockBase *SuccBB) {
+ return cast<VPBasicBlock>(SuccBB);
+ }));
+ else
+ append_range(SinkBlocks, map_range(Def->users(), [](VPUser *U) {
+ return cast<VPRecipeBase>(U)->getParent();
+ }));
+
+ for (auto *SinkTo : SinkBlocks) {
+ // This will need to be a check instead of an assert after conditional
+ // branches in vectorized loops are supported.
+ assert(VPDT.properlyDominates(VPBB, SinkTo) &&
+ "Defining block must dominate sink block");
+ auto *Clone = Def->clone();
+ Clone->insertBefore(*SinkTo, SinkTo->getFirstNonPhi());
+ Def->replaceUsesWithIf(Clone, [&SinkTo](VPUser &U, unsigned) {
+ return cast<VPRecipeBase>(U).getParent() == SinkTo;
+ });
+ }
+ // Avoid erroneously bailing on the already-sunk recipe when we get to the
+ // next recipe in post-order and query its users.
+ Def->eraseFromParent();
}
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
index add503acb9420..5a36042ad19b3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -92,10 +92,10 @@ exit:
define i32 @select_xor_cond(ptr %src, i1 %c.0) {
; CHECK: LV: Checking a loop in 'select_xor_cond'
-; CHECK: Cost of 1 for VF 2: WIDEN ir<%sel> = select ir<%c>, ir<false>, ir<%c.0>
-; CHECK: Cost of 1 for VF 4: WIDEN ir<%sel> = select ir<%c>, ir<false>, ir<%c.0>
-; CHECK: Cost of 1 for VF 8: WIDEN ir<%sel> = select ir<%c>, ir<false>, ir<%c.0>
-; CHECK: Cost of 1 for VF 16: WIDEN ir<%sel> = select ir<%c>, ir<false>, ir<%c.0>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%sel> = select ir<%c>{{.*}}, ir<false>, ir<%c.0>
+; CHECK: Cost of 1 for VF 4: WIDEN ir<%sel> = select ir<%c>{{.*}}, ir<false>, ir<%c.0>
+; CHECK: Cost of 1 for VF 8: WIDEN ir<%sel> = select ir<%c>{{.*}}, ir<false>, ir<%c.0>
+; CHECK: Cost of 1 for VF 16: WIDEN ir<%sel> = select ir<%c>{{.*}}, ir<false>, ir<%c.0>
; CHECK: LV: Selecting VF: 4.
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
index 0a25460a52fad..3f29bbb22a3e1 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -24,10 +24,6 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x float> [[WIDE_LOAD]], i32 0
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x float>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x float> [[WIDE_LOAD1]], i32 0
-; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[TMP10]], 1.000000e+02
-; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt float [[TMP12]], 1.000000e+02
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2]])
; CHECK-NEXT: [[TMP14:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP15:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
@@ -38,6 +34,10 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP17:%.*]] = fcmp ogt float [[TMP10]], 1.000000e+02
+; CHECK-NEXT: [[TMP18:%.*]] = fcmp ogt float [[TMP12]], 1.000000e+02
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP17]])
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP18]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
More information about the llvm-commits
mailing list