[llvm-branch-commits] [llvm] [VPlan] Scalarize to first-lane-only directly on VPlan (PR #184267)
Andrei Elovikov via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Mar 2 16:01:29 PST 2026
https://github.com/eas created https://github.com/llvm/llvm-project/pull/184267
This is needed to enable subsequent https://github.com/llvm/llvm-project/pull/182595.
I don't think we can fully port all scalarization logic from the legacy path to VPlan-based right now because that would require us to introduce interleave groups much earlier in VPlan pipeline, and without that we can't really `assert` this new decision matches the previous CM-based one. And without those `assert`s it's really hard to ensure we properly port all the previous logic.
As such, I decided just to implement something much simpler that would be enough for #182595. However, we perform this transformation before delegating to the old CM-based decision, so it **is** effective immediately, taking precedence even for consecutive loads/stores.
Depends on https://github.com/llvm/llvm-project/pull/182592 but is stacked on top of https://github.com/llvm/llvm-project/pull/182594 to enable linear stacking for https://github.com/llvm/llvm-project/pull/182595.
>From aa4bba35aae66619c5103278f4294e2a5e695831 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Fri, 27 Feb 2026 11:02:19 -0800
Subject: [PATCH] [VPlan] Scalarize to first-lane-only directly on VPlan
This is needed to enable subsequent https://github.com/llvm/llvm-project/pull/182595.
I don't think we can fully port all scalarization logic from the legacy
path to VPlan-based right now because that would require us to introduce
interleave groups much earlier in VPlan pipeline, and without that we
can't really `assert` this new decision matches the previous CM-based
one. And without those `assert`s it's really hard to ensure we properly
port all the previous logic.
As such, I decided just to implement something much simpler that would
be enough for #182595. However, we perform this transformation before
delegating to the old CM-based decision, so it **is** effective
immediately, taking precedence even for consecutive loads/stores.
Depends on https://github.com/llvm/llvm-project/pull/182592 but is stacked on
top of https://github.com/llvm/llvm-project/pull/182594 to enable linear
stacking for https://github.com/llvm/llvm-project/pull/182595.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 3 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 65 +++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 6 ++
.../VPlan/vplan-print-after-all.ll | 1 +
.../X86/drop-poison-generating-flags.ll | 4 +-
llvm/test/Transforms/LoopVectorize/pr37248.ll | 2 +-
.../runtime-check-needed-but-empty.ll | 2 +-
7 files changed, 79 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 351bbe84d91b1..118890e49868c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8245,6 +8245,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeMemOpWideningDecisions, *Plan,
Range, RecipeBuilder, CostCtx, *CM.Legal);
+ RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::makeScalarizationDecisions, *Plan,
+ Range, RecipeBuilder);
+
// Now process all other blocks and instructions.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
// Convert input VPInstructions to widened recipes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 822f3ee6dbedb..5d960a813241e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -6395,3 +6395,68 @@ void VPlanTransforms::makeMemOpWideningDecisions(
return ReplaceWith(VPI, Recipe);
});
}
+
+void VPlanTransforms::makeScalarizationDecisions(
+ VPlan &Plan, VFRange &Range, VPRecipeBuilder &RecipeBuilder) {
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ return;
+
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ post_order<VPBlockShallowTraversalWrapper<VPBlockBase *>>(
+ HeaderVPBB))) {
+ for (VPRecipeBase &R :
+ make_early_inc_range(make_range(VPBB->rbegin(), VPBB->rend()))) {
+ auto *VPI = dyn_cast<VPInstruction>(&R);
+ if (!VPI)
+ continue;
+
+ auto *I = cast_or_null<Instruction>(VPI->getUnderlyingValue());
+ if (!I)
+ // Wouldn't be able to create a `VPReplicateRecipe` anyway.
+ continue;
+
+ bool CanTransformToFirstLaneOnly = [&]() {
+ if (VPI->mayHaveSideEffects())
+ return false;
+
+ if (is_contained({Instruction::SDiv, Instruction::UDiv,
+ Instruction::SRem, Instruction::URem},
+ VPI->getOpcode()) &&
+ VPI->getMask())
+ return false;
+
+ // Avoid rewriting IV increment as that interferes with
+ // `removeRedundantCanonicalIVs`.
+ if (VPI->getOpcode() == Instruction::Add &&
+ any_of(VPI->operands(),
+ [&](auto *Op) { return isa<VPWidenInductionRecipe>(Op); }))
+ return false;
+
+ if (!all_of(VPI->users(), [&](auto *U) {
+ // TODO: This "ScalarCast" is bonkers...
+ if (VPI->isScalarCast() && isa<VPWidenGEPRecipe>(U))
+ return false;
+
+ return U->usesFirstLaneOnly(VPI);
+ }))
+ return false;
+
+ return true;
+ }();
+
+ if (CanTransformToFirstLaneOnly) {
+ auto *Recipe =
+ new VPReplicateRecipe(I, VPI->operandsWithoutMask(), true, nullptr,
+ *VPI, *VPI, VPI->getDebugLoc());
+ Recipe->insertBefore(VPI);
+ VPI->replaceAllUsesWith(Recipe);
+ RecipeBuilder.setRecipe(I, Recipe);
+ VPI->eraseFromParent();
+ continue;
+ }
+ }
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index a601de38a53cd..999f8e93e2b05 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -485,6 +485,12 @@ struct VPlanTransforms {
VPRecipeBuilder &RecipeBuilder,
VPCostContext &CostCtx,
LoopVectorizationLegality &Legal);
+
+ /// Make VPlan-based scalarization decisions prior to delegating to the ones
+ /// made by the legacy CM. Only transforms `usesFirstLaneOnly` def-use chains
+ /// enabled by prior widening of consecutive memory operations for now.
+ static void makeScalarizationDecisions(VPlan &Plan, VFRange &Range,
+ VPRecipeBuilder &RecipeBuilder);
};
/// A helper function that returns true if the given type is irregular. The
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
index 706d91260a70e..2c14456857a31 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
@@ -9,6 +9,7 @@
; CHECK: VPlan for loop in 'foo' after scalarizeMemOpsWithIrregularTypes
; CHECK: VPlan for loop in 'foo' after delegateMemOpWideningToLegacyCM
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeMemOpWideningDecisions
+; CHECK: VPlan for loop in 'foo' after VPlanTransforms::makeScalarizationDecisions
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::clearReductionWrapFlags
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::optimizeFindIVReductions
; CHECK: VPlan for loop in 'foo' after VPlanTransforms::handleMultiUseReductions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index f20bf95af4b58..62805beeb7231 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -428,9 +428,9 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP0]], <4 x double> poison)
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll
index 98da110a44e8c..bfc62fb208466 100644
--- a/llvm/test/Transforms/LoopVectorize/pr37248.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll
@@ -42,7 +42,6 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[START]], [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -55,6 +54,7 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) {
; CHECK-NEXT: store i32 10, ptr [[B]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE3]]
; CHECK: [[PRED_STORE_CONTINUE3]]:
+; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i16], ptr @a, i16 0, i16 [[TMP12]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP15]], i64 -1
; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP17]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
index dd7a8a87a921b..7548a783fb4dd 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
@@ -13,12 +13,12 @@ define void @test(ptr %A, i32 %x) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], ptr [[TMP9]], align 4
More information about the llvm-branch-commits
mailing list