[llvm-branch-commits] [llvm] [VPlan] Scalarize to first-lane-only directly on VPlan (PR #184267)

Andrei Elovikov via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Mar 2 16:01:29 PST 2026


https://github.com/eas created https://github.com/llvm/llvm-project/pull/184267

This is needed to enable subsequent https://github.com/llvm/llvm-project/pull/182595.

I don't think we can fully port all scalarization logic from the legacy path to the VPlan-based one right now, because that would require introducing interleave groups much earlier in the VPlan pipeline. Without that, we can't really `assert` that the new decisions match the previous CM-based ones, and without those `assert`s it's hard to be sure we've properly ported all of the previous logic.

As such, I decided to implement something much simpler that is enough for #182595. Note that we perform this transformation before delegating to the old CM-based decisions, so it **is** effective immediately and takes precedence even for consecutive loads/stores.

Depends on https://github.com/llvm/llvm-project/pull/182592 but is stacked on top of https://github.com/llvm/llvm-project/pull/182594 to enable linear stacking for https://github.com/llvm/llvm-project/pull/182595.

From aa4bba35aae66619c5103278f4294e2a5e695831 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Fri, 27 Feb 2026 11:02:19 -0800
Subject: [PATCH] [VPlan] Scalarize to first-lane-only directly on VPlan

This is needed to enable subsequent https://github.com/llvm/llvm-project/pull/182595.

I don't think we can fully port all scalarization logic from the legacy
path to the VPlan-based one right now, because that would require
introducing interleave groups much earlier in the VPlan pipeline.
Without that, we can't really `assert` that the new decisions match the
previous CM-based ones, and without those `assert`s it's hard to be sure
we've properly ported all of the previous logic.

As such, I decided to implement something much simpler that is enough
for #182595. Note that we perform this transformation before delegating
to the old CM-based decisions, so it **is** effective immediately and
takes precedence even for consecutive loads/stores.

Depends on https://github.com/llvm/llvm-project/pull/182592 but is stacked on
top of https://github.com/llvm/llvm-project/pull/182594 to enable linear
stacking for https://github.com/llvm/llvm-project/pull/182595.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  3 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 65 +++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |  6 ++
 .../VPlan/vplan-print-after-all.ll            |  1 +
 .../X86/drop-poison-generating-flags.ll       |  4 +-
 llvm/test/Transforms/LoopVectorize/pr37248.ll |  2 +-
 .../runtime-check-needed-but-empty.ll         |  2 +-
 7 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 351bbe84d91b1..118890e49868c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8245,6 +8245,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
                            Range, RecipeBuilder, CostCtx, *CM.Legal);
 
+  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeScalarizationDecisions, *Plan,
+                           Range, RecipeBuilder);
+
   // Now process all other blocks and instructions.
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
     // Convert input VPInstructions to widened recipes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 822f3ee6dbedb..5d960a813241e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -6395,3 +6395,68 @@ void VPlanTransforms::makeMemOpWideningDecisions(
                              return ReplaceWith(VPI, Recipe);
                            });
 }
+
+void VPlanTransforms::makeScalarizationDecisions(
+    VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
+  if (LoopVectorizationPlanner::getDecisionAndClampRange(
+          [&](ElementCount VF) { return VF.isScalar(); }, Range))
+    return;
+
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           post_order<VPBlockShallowTraversalWrapper<VPBlockBase *>>(
+               HeaderVPBB))) {
+    for (VPRecipeBase &R :
+         make_early_inc_range(make_range(VPBB->rbegin(), VPBB->rend()))) {
+      auto *VPI = dyn_cast<VPInstruction>(&R);
+      if (!VPI)
+        continue;
+
+      auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
+      if (!I)
+        // Wouldn't be able to create a `VPReplicateRecipe` anyway.
+        continue;
+
+      bool CanTransformToFirstLaneOnly = [&]() {
+        if (VPI->mayHaveSideEffects())
+          return false;
+
+        if (is_contained({Instruction::SDiv, Instruction::UDiv,
+                          Instruction::SRem, Instruction::URem},
+                         VPI->getOpcode()) &&
+            VPI->getMask())
+          return false;
+
+        // Avoid rewriting IV increment as that interferes with
+        // `removeRedundantCanonicalIVs`.
+        if (VPI->getOpcode() == Instruction::Add &&
+            any_of(VPI->operands(),
+                   [&](auto *Op) { return isa<VPWidenInductionRecipe>(Op); }))
+          return false;
+
+        if (!all_of(VPI->users(), [&](auto *U) {
+              // TODO: This "ScalarCast" is bonkers...
+              if (VPI->isScalarCast() && isa<VPWidenGEPRecipe>(U))
+                return false;
+
+              return U->usesFirstLaneOnly(VPI);
+            }))
+          return false;
+
+        return true;
+      }();
+
+      if (CanTransformToFirstLaneOnly) {
+        auto *Recipe =
+            new VPReplicateRecipe(I, VPI->operandsWithoutMask(), true, nullptr,
+                                  *VPI, *VPI, VPI->getDebugLoc());
+        Recipe->insertBefore(VPI);
+        VPI->replaceAllUsesWith(Recipe);
+        RecipeBuilder.setRecipe(I, Recipe);
+        VPI->eraseFromParent();
+        continue;
+      }
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index a601de38a53cd..999f8e93e2b05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -485,6 +485,12 @@ struct VPlanTransforms {
                                          VPRecipeBuilder &RecipeBuilder,
                                          VPCostContext &CostCtx,
                                          LoopVectorizationLegality &Legal);
+
+  /// Make VPlan-based scalarization decisions prior to delegating to the ones
+  /// made by the legacy CM. For now, only transforms "usesFirstLaneOnly"
+  /// def-use chains enabled by prior widening of consecutive memory operations.
+  static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range,
+                                         VPRecipeBuilder &RecipeBuilder);
 };
 
 /// A helper function that returns true if the given type is irregular. The
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
index 706d91260a70e..2c14456857a31 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
@@ -9,6 +9,7 @@
 ; CHECK: VPlan for loop in 'foo' after scalarizeMemOpsWithIrregularTypes
 ; CHECK: VPlan for loop in 'foo' after delegateMemOpWideningToLegacyCM
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeMemOpWideningDecisions
+; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeScalarizationDecisions
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::clearReductionWrapFlags
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::optimizeFindIVReductions
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::handleMultiUseReductions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index f20bf95af4b58..62805beeb7231 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -428,9 +428,9 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 {
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP0]], <4 x double> poison)
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll
index 98da110a44e8c..bfc62fb208466 100644
--- a/llvm/test/Transforms/LoopVectorize/pr37248.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll
@@ -42,7 +42,6 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) {
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[START]], [[INDEX]]
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP13]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; CHECK:       [[PRED_STORE_IF]]:
@@ -55,6 +54,7 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) {
 ; CHECK-NEXT:    store i32 10, ptr [[B]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE3]]
 ; CHECK:       [[PRED_STORE_CONTINUE3]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP12]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i64 -1
 ; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr [[TMP17]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
index dd7a8a87a921b..7548a783fb4dd 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
@@ -13,12 +13,12 @@ define void @test(ptr %A, i32 %x) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    store <4 x float> [[WIDE_LOAD]], ptr [[TMP9]], align 4



More information about the llvm-branch-commits mailing list