[llvm] [VPlan] Generalize licm-sink to strip zero-user bail (PR #187077)

Tue Mar 31 01:13:27 PDT 2026

https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/187077

>From 39b66ee1e192331753378921a4af62612d978412 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <artagnon at tenstorrent.com>
Date: Tue, 17 Mar 2026 16:44:27 +0000
Subject: [PATCH] [VPlan] Generalize licm-sink to strip zero-user bail

In preparation for sinking unpredicated stores, remove the zero-user
bail-out in the licm transform, generalizing it to sink recipes to all
successors of the vector loop region.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 67 ++++++++++---------
 .../LoopVectorize/AArch64/select-costs.ll     |  8 +--
 .../LoopVectorize/scalable-assume.ll          |  8 +--
 3 files changed, 43 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f779aa92a3aa7..6336b3a76d252 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2679,9 +2679,8 @@ static void licm(VPlan &Plan) {
 #ifndef NDEBUG
   VPDominatorTree VPDT(Plan);
 #endif
-  // Sink recipes with no users inside the vector loop region if all users are
-  // in the same exit block of the region.
-  // TODO: Extend to sink recipes from inner loops.
+  // Sink recipes within the vector loop region to successors of the loop
+  // region.
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_post_order_shallow(LoopRegion->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
@@ -2703,43 +2702,47 @@ static void licm(VPlan &Plan) {
       // TODO: Use R.definedValues() instead of casting to VPSingleDefRecipe to
       // support recipes with multiple defined values (e.g., interleaved loads).
       auto *Def = cast<VPSingleDefRecipe>(&R);
-      // Skip recipes without users as we cannot determine a sink block.
-      // TODO: Clone sinkable recipes without users to all exit blocks to reduce
-      // their execution frequency.
-      if (Def->getNumUsers() == 0)
-        continue;
 
-      VPBasicBlock *SinkBB = nullptr;
-      // Cannot sink the recipe if any user
-      //  * is defined in any loop region, or
-      //  * is a phi, or
-      //  * multiple users in different blocks.
-      if (any_of(Def->users(), [&SinkBB](VPUser *U) {
+      // Cannot sink the recipe if any user is a phi, is inside a loop region,
+      // or is in a block whose single predecessor is not the vector loop
+      // region.
+      if (any_of(Def->users(), [&LoopRegion](VPUser *U) {
             auto *UserR = cast<VPRecipeBase>(U);
             VPBasicBlock *Parent = UserR->getParent();
             // TODO: If the user is a PHI node, we should check the block of
             // incoming value. Support PHI node users if needed.
-            if (UserR->isPhi() || Parent->getEnclosingLoopRegion())
-              return true;
-            // TODO: Support sinking when users are in multiple blocks.
-            if (SinkBB && SinkBB != Parent)
-              return true;
-            SinkBB = Parent;
-            return false;
+            return UserR->isPhi() || Parent->getEnclosingLoopRegion() ||
+                   Parent->getSinglePredecessor() != LoopRegion;
           }))
         continue;
 
-      // Only sink to dedicated exit blocks of the loop region.
-      if (SinkBB->getSinglePredecessor() != LoopRegion)
-        continue;
-
-      // TODO: This will need to be a check instead of a assert after
-      // conditional branches in vectorized loops are supported.
-      assert(VPDT.properlyDominates(VPBB, SinkBB) &&
-             "Defining block must dominate sink block");
-      // TODO: Clone the recipe if users are on multiple exit paths, instead of
-      // just moving.
-      Def->moveBefore(*SinkBB, SinkBB->getFirstNonPhi());
+      // Compute the users' parent blocks if there are users. Otherwise, sink to
+      // all successor blocks.
+      SmallVector<VPBasicBlock *> SinkBlocks;
+      if (Def->users().empty())
+        append_range(SinkBlocks, map_range(LoopRegion->successors(),
+                                           [](VPBlockBase *SuccBB) {
+                                             return cast<VPBasicBlock>(SuccBB);
+                                           }));
+      else
+        append_range(SinkBlocks, map_range(Def->users(), [](VPUser *U) {
+                       return cast<VPRecipeBase>(U)->getParent();
+                     }));
+
+      for (auto *SinkTo : SinkBlocks) {
+        // This will need to be a check instead of an assert after conditional
+        // branches in vectorized loops are supported.
+        assert(VPDT.properlyDominates(VPBB, SinkTo) &&
+               "Defining block must dominate sink block");
+        auto *Clone = Def->clone();
+        Clone->insertBefore(*SinkTo, SinkTo->getFirstNonPhi());
+        Def->replaceUsesWithIf(Clone, [&SinkTo](VPUser &U, unsigned) {
+          return cast<VPRecipeBase>(U).getParent() == SinkTo;
+        });
+      }
+      // Avoid erroneously bailing on the already-sunk recipe when we get to the
+      // next recipe in post-order and query its users.
+      Def->eraseFromParent();
     }
   }
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
index add503acb9420..5a36042ad19b3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -92,10 +92,10 @@ exit:
 
 define i32 @select_xor_cond(ptr %src, i1 %c.0) {
 ; CHECK: LV: Checking a loop in 'select_xor_cond'
-; CHECK: Cost of 1 for VF 2: WIDEN ir<%sel> = select ir<%c>, ir<false>, ir<%c.0>
-; CHECK: Cost of 1 for VF 4: WIDEN ir<%sel> = select ir<%c>, ir<false>, ir<%c.0>
-; CHECK: Cost of 1 for VF 8: WIDEN ir<%sel> = select ir<%c>, ir<false>, ir<%c.0>
-; CHECK: Cost of 1 for VF 16: WIDEN ir<%sel> = select ir<%c>, ir<false>, ir<%c.0>
+; CHECK: Cost of 1 for VF 2: WIDEN ir<%sel> = select ir<%c>{{.*}}, ir<false>, ir<%c.0>
+; CHECK: Cost of 1 for VF 4: WIDEN ir<%sel> = select ir<%c>{{.*}}, ir<false>, ir<%c.0>
+; CHECK: Cost of 1 for VF 8: WIDEN ir<%sel> = select ir<%c>{{.*}}, ir<false>, ir<%c.0>
+; CHECK: Cost of 1 for VF 16: WIDEN ir<%sel> = select ir<%c>{{.*}}, ir<false>, ir<%c.0>
 ; CHECK: LV: Selecting VF: 4.
 
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
index 0a25460a52fad..3f29bbb22a3e1 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -24,10 +24,6 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 2 x float> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x float>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 2 x float> [[WIDE_LOAD1]], i32 0
-; CHECK-NEXT:    [[FCMP1:%.*]] = fcmp ogt float [[TMP10]], 1.000000e+02
-; CHECK-NEXT:    [[FCMP2:%.*]] = fcmp ogt float [[TMP12]], 1.000000e+02
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[FCMP1]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[FCMP2]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00)
 ; CHECK-NEXT:    [[TMP15:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00)
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
@@ -38,6 +34,10 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b)
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = fcmp ogt float [[TMP10]], 1.000000e+02
+; CHECK-NEXT:    [[TMP18:%.*]] = fcmp ogt float [[TMP12]], 1.000000e+02
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP17]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP18]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]: