[llvm] [VPlan] Handle regions with live-outs and scalar VF when replicating. (PR #186252)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 30 04:23:02 PDT 2026
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/186252
>From ae492f889516272a0d4fe927ea22b08f73d4c49f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 12 Mar 2026 21:02:18 +0000
Subject: [PATCH 1/3] [VPlan] Handle regions with live-outs and scalar VF when
replicating.
Extend initial unrolling of replicate regions
(https://github.com/llvm/llvm-project/pull/170212) to support live-outs,
if the VF is scalar.
This allows adding the logic needed to explicitly unroll, and replacing
VPPredInstPHIRecipes with regular scalar VPPhis, without yet having to worry
about packing values into vector phis. This will be done in a follow-up
change, which means all replicate regions will be fully dissolved.
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 +++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 5 +----
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 21 +++++++++++++++----
3 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8f937b2f8bc1f..e6385899173d8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1637,7 +1637,11 @@ void VPPhi::execute(VPTransformState &State) {
PHINode *NewPhi = State.Builder.CreatePHI(
State.TypeAnalysis.inferScalarType(this), 2, getName());
unsigned NumIncoming = getNumIncoming();
- if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
+ // Detect header phis: the parent block dominates its second incoming block
+ // (the latch). Non-header phis, e.g. from dissolved replicate regions, don't
+ // have this property.
+ if (NumIncoming == 2 &&
+ State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
// TODO: Fixup all incoming values of header phis once recipes defining them
// are introduced.
NumIncoming = 1;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7e8496a568643..e466e819931f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -597,10 +597,7 @@ bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
auto *ParentRegion = VPBB->getParent();
if (ParentRegion && ParentRegion->getExiting() == VPBB)
ParentRegion->setExiting(PredVPBB);
- for (auto *Succ : to_vector(VPBB->successors())) {
- VPBlockUtils::disconnectBlocks(VPBB, Succ);
- VPBlockUtils::connectBlocks(PredVPBB, Succ);
- }
+ VPBlockUtils::transferSuccessors(VPBB, PredVPBB);
// VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
return !WorkList.empty();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index c0404c92ad641..b3f0953f97fc5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;
@@ -698,6 +699,15 @@ static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
{BranchOnMask->getOperand(0)},
BranchOnMask->getDebugLoc());
BranchOnMask->eraseFromParent();
+ } else if (auto *PredPhi = dyn_cast<VPPredInstPHIRecipe>(&OldR)) {
+ VPValue *PredOp = PredPhi->getOperand(0);
+ VPValue *PoisonVal = Plan.getOrAddLiveIn(
+ PoisonValue::get(VPTypeAnalysis(Plan).inferScalarType(PredOp)));
+
+ VPPhi *NewPhi = Builder.createScalarPhi({PoisonVal, PredOp},
+ PredPhi->getDebugLoc());
+ PredPhi->replaceAllUsesWith(NewPhi);
+ PredPhi->eraseFromParent();
} else {
assert((isa<VPScalarIVStepsRecipe>(OldR) ||
(isa<VPInstruction>(OldR) &&
@@ -800,7 +810,8 @@ static void replicateReplicateRegionsByVF(VPlan &Plan, ElementCount VF,
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
// Skip regions with live-outs as packing scalar results back into vectors
// is not yet implemented.
- if (Region->isReplicator() && Region->getExitingBasicBlock()->empty())
+ if (Region->isReplicator() &&
+ (!VF.isScalar() && Region->getExitingBasicBlock()->empty()))
ReplicateRegions.push_back(Region);
}
@@ -815,12 +826,14 @@ static void replicateReplicateRegionsByVF(VPlan &Plan, ElementCount VF,
}
void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
- if (Plan.hasScalarVFOnly())
- return;
-
Type *IdxTy = IntegerType::get(
Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
+ if (Plan.hasScalarVFOnly()) {
+ replicateReplicateRegionsByVF(Plan, VF, IdxTy);
+ return;
+ }
+
// Visit all VPBBs outside the loop region and directly inside the top-level
// loop region.
auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
>From 3887d09ba31d85823c0056385ac4b7151adb304a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 29 Mar 2026 13:08:23 +0100
Subject: [PATCH 2/3] !fixup address comments, thanks
---
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 17 +++++--
.../VPlan/dissolve-replicate-regions.ll | 51 +++++++------------
.../VPlan/interleave-and-scalarize-only.ll | 3 +-
.../LoopVectorize/cast-induction.ll | 7 ++-
.../pr45679-fold-tail-by-masking.ll | 10 ++--
.../tail-folding-vectorization-factor-1.ll | 5 +-
6 files changed, 40 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 11a0b29bf7a3b..bc309cf327beb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -670,7 +670,8 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
/// VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is
/// converted into BranchOnCond and extracts are created as needed.
static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
- VPBlockBase *Entry) {
+ VPBlockBase *Entry,
+ ElementCount VF) {
VPValue *Idx0 = Plan.getZero(IdxTy);
for (VPBlockBase *VPB : vp_depth_first_shallow(Entry)) {
for (VPRecipeBase &OldR : make_early_inc_range(cast<VPBasicBlock>(*VPB))) {
@@ -687,6 +688,10 @@ static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
vputils::isSingleScalar(Op))
continue;
+ // For scalar VF, operands are already scalar; no extraction needed.
+ if (VF.isScalar())
+ continue;
+
// Extract lane zero from values defined outside the region.
VPValue *Extract = Builder.createNaryOp(Instruction::ExtractElement,
{Op, Idx0}, OldR.getDebugLoc());
@@ -778,7 +783,7 @@ static void dissolveReplicateRegion(VPRegionBlock *Region, ElementCount VF,
// Process the original blocks for lane 0: converting their recipes to
// single-scalar.
- convertRecipesInRegionBlocksToSingleScalar(Plan, IdxTy, FirstLaneEntry);
+ convertRecipesInRegionBlocksToSingleScalar(Plan, IdxTy, FirstLaneEntry, VF);
// Clone converted blocks for remaining lanes and process each in reverse
// order, connecting each lane's Exiting block to the subsequent lane's entry.
@@ -812,10 +817,10 @@ static void replicateReplicateRegionsByVF(VPlan &Plan, ElementCount VF,
SmallVector<VPRegionBlock *> ReplicateRegions;
for (VPRegionBlock *Region : VPBlockUtils::blocksOnly<VPRegionBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
- // Skip regions with live-outs as packing scalar results back into vectors
- // is not yet implemented.
+ // Skip regions with live-outs when vectorizing as packing scalar results
+ // back into vectors is not yet implemented.
if (Region->isReplicator() &&
- (!VF.isScalar() && Region->getExitingBasicBlock()->empty()))
+ (VF.isScalar() || Region->getExitingBasicBlock()->empty()))
ReplicateRegions.push_back(Region);
}
@@ -834,6 +839,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
if (Plan.hasScalarVFOnly()) {
+ // When Plan is only unrolled by UF, replicating by VF amounts to dissolving
+ // replicate regions.
replicateReplicateRegionsByVF(Plan, VF, IdxTy);
return;
}
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/dissolve-replicate-regions.ll b/llvm/test/Transforms/LoopVectorize/VPlan/dissolve-replicate-regions.ll
index a9b3f7a72dfbc..cb046214051ef 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/dissolve-replicate-regions.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/dissolve-replicate-regions.ll
@@ -23,43 +23,28 @@ define void @predicated_load(i1 %c, ptr %ptr, ptr %dst) {
; SCALAR-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; SCALAR-NEXT: vp<[[VP4:%[0-9]+]]> = SCALAR-STEPS vp<[[VP3]]>, ir<1>, vp<[[VP0]]>
; SCALAR-NEXT: vp<[[VP5:%[0-9]+]]> = SCALAR-STEPS vp<[[VP3]]>, ir<1>, vp<[[VP0]]>, vp<[[VP0]]>
-; SCALAR-NEXT: Successor(s): pred.load
+; SCALAR-NEXT: EMIT branch-on-cond ir<%c>
+; SCALAR-NEXT: Successor(s): pred.load.if, pred.load.continue
; SCALAR-EMPTY:
-; SCALAR-NEXT: <xVFxUF> pred.load: {
-; SCALAR-NEXT: pred.load.entry:
-; SCALAR-NEXT: BRANCH-ON-MASK ir<%c>
-; SCALAR-NEXT: Successor(s): pred.load.if, pred.load.continue
+; SCALAR-NEXT: pred.load.if:
+; SCALAR-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[VP4]]>
+; SCALAR-NEXT: CLONE ir<%lv> = load ir<%gep>
+; SCALAR-NEXT: Successor(s): pred.load.continue
; SCALAR-EMPTY:
-; SCALAR-NEXT: pred.load.if:
-; SCALAR-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[VP4]]>
-; SCALAR-NEXT: CLONE ir<%lv> = load ir<%gep>
-; SCALAR-NEXT: Successor(s): pred.load.continue
+; SCALAR-NEXT: pred.load.continue:
+; SCALAR-NEXT: EMIT-SCALAR vp<[[VP7:%[0-9]+]]> = phi [ ir<poison>, vector.body ], [ ir<%lv>, pred.load.if ]
+; SCALAR-NEXT: EMIT branch-on-cond ir<%c>
+; SCALAR-NEXT: Successor(s): pred.load.if, pred.load.continue
; SCALAR-EMPTY:
-; SCALAR-NEXT: pred.load.continue:
-; SCALAR-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[VP6:%[0-9]+]]> = ir<%lv>
-; SCALAR-NEXT: No successors
-; SCALAR-NEXT: }
-; SCALAR-NEXT: Successor(s): pred.load
+; SCALAR-NEXT: pred.load.if:
+; SCALAR-NEXT: CLONE ir<%gep>.1 = getelementptr ir<%ptr>, vp<[[VP5]]>
+; SCALAR-NEXT: CLONE ir<%lv>.1 = load ir<%gep>.1
+; SCALAR-NEXT: Successor(s): pred.load.continue
; SCALAR-EMPTY:
-; SCALAR-NEXT: <xVFxUF> pred.load: {
-; SCALAR-NEXT: pred.load.entry:
-; SCALAR-NEXT: BRANCH-ON-MASK ir<%c>
-; SCALAR-NEXT: Successor(s): pred.load.if, pred.load.continue
-; SCALAR-EMPTY:
-; SCALAR-NEXT: pred.load.if:
-; SCALAR-NEXT: CLONE ir<%gep>.1 = getelementptr ir<%ptr>, vp<[[VP5]]>
-; SCALAR-NEXT: CLONE ir<%lv>.1 = load ir<%gep>.1
-; SCALAR-NEXT: Successor(s): pred.load.continue
-; SCALAR-EMPTY:
-; SCALAR-NEXT: pred.load.continue:
-; SCALAR-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[VP7:%[0-9]+]]> = ir<%lv>.1
-; SCALAR-NEXT: No successors
-; SCALAR-NEXT: }
-; SCALAR-NEXT: Successor(s): if.then.0
-; SCALAR-EMPTY:
-; SCALAR-NEXT: if.then.0:
-; SCALAR-NEXT: BLEND ir<%pred.val> = ir<0> vp<%6>/ir<%c>
-; SCALAR-NEXT: BLEND ir<%pred.val>.1 = ir<0> vp<%7>/ir<%c>
+; SCALAR-NEXT: pred.load.continue:
+; SCALAR-NEXT: EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<poison>, pred.load.continue ], [ ir<%lv>.1, pred.load.if ]
+; SCALAR-NEXT: BLEND ir<%pred.val> = ir<0> vp<%7>/ir<%c>
+; SCALAR-NEXT: BLEND ir<%pred.val>.1 = ir<0> vp<%9>/ir<%c>
; SCALAR-NEXT: CLONE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[VP4]]>
; SCALAR-NEXT: CLONE ir<%gep.dst>.1 = getelementptr ir<%dst>, vp<[[VP5]]>
; SCALAR-NEXT: CLONE store ir<%pred.val>, ir<%gep.dst>
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll
index a641e7ef54de7..0a17319c04d35 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/interleave-and-scalarize-only.ll
@@ -136,9 +136,8 @@ define void @test_scalarize_with_branch_cond(ptr %src, ptr %dst) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %pred.store.continue4 ]
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i1
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i1 false, [[TMP0]]
-; CHECK-NEXT: [[INDUCTION2:%.*]] = add i1 [[OFFSET_IDX]], false
; CHECK-NEXT: [[INDUCTION3:%.*]] = add i1 [[OFFSET_IDX]], true
-; CHECK-NEXT: br i1 [[INDUCTION2]], label %pred.store.if, label %pred.store.continue
+; CHECK-NEXT: br i1 [[OFFSET_IDX]], label %pred.store.if, label %pred.store.continue
; CHECK: pred.store.if:
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr %src, i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/cast-induction.ll b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
index e93b84c2f5603..e5069d97f3117 100644
--- a/llvm/test/Transforms/LoopVectorize/cast-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
@@ -273,15 +273,14 @@ define void @cast_induction_tail_folding(ptr %A) {
; IC2-NEXT: br label %[[VECTOR_BODY:.*]]
; IC2: [[VECTOR_BODY]]:
; IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
-; IC2-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; IC2-NEXT: [[INDEX1:%.*]] = add i32 [[INDEX]], 1
-; IC2-NEXT: [[TMP2:%.*]] = icmp ule i32 [[TMP0]], 2
+; IC2-NEXT: [[TMP2:%.*]] = icmp ule i32 [[INDEX]], 2
; IC2-NEXT: [[TMP3:%.*]] = icmp ule i32 [[INDEX1]], 2
; IC2-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; IC2: [[PRED_STORE_IF]]:
-; IC2-NEXT: [[TMP4:%.*]] = sext i32 [[TMP0]] to i64
+; IC2-NEXT: [[TMP4:%.*]] = sext i32 [[INDEX]] to i64
; IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
-; IC2-NEXT: store i32 [[TMP0]], ptr [[TMP5]], align 4
+; IC2-NEXT: store i32 [[INDEX]], ptr [[TMP5]], align 4
; IC2-NEXT: br label %[[PRED_STORE_CONTINUE]]
; IC2: [[PRED_STORE_CONTINUE]]:
; IC2-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index 610da05dbbab7..c07134229322e 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -118,17 +118,16 @@ define void @pr45679(ptr %A) {
; VF1UF4-NEXT: br label [[VECTOR_BODY:%.*]]
; VF1UF4: vector.body:
; VF1UF4-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; VF1UF4-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; VF1UF4-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
; VF1UF4-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
; VF1UF4-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3
-; VF1UF4-NEXT: [[TMP4:%.*]] = icmp ule i32 [[TMP0]], 13
+; VF1UF4-NEXT: [[TMP4:%.*]] = icmp ule i32 [[INDEX]], 13
; VF1UF4-NEXT: [[TMP5:%.*]] = icmp ule i32 [[TMP1]], 13
; VF1UF4-NEXT: [[TMP6:%.*]] = icmp ule i32 [[TMP2]], 13
; VF1UF4-NEXT: [[TMP7:%.*]] = icmp ule i32 [[TMP3]], 13
; VF1UF4-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; VF1UF4: pred.store.if:
-; VF1UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; VF1UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]]
; VF1UF4-NEXT: store i32 13, ptr [[TMP8]], align 1
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]]
; VF1UF4: pred.store.continue:
@@ -289,17 +288,16 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
; VF1UF4-NEXT: br label [[VECTOR_BODY:%.*]]
; VF1UF4: vector.body:
; VF1UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; VF1UF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VF1UF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; VF1UF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; VF1UF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; VF1UF4-NEXT: [[TMP4:%.*]] = icmp ule i64 [[TMP0]], 13
+; VF1UF4-NEXT: [[TMP4:%.*]] = icmp ule i64 [[INDEX]], 13
; VF1UF4-NEXT: [[TMP5:%.*]] = icmp ule i64 [[TMP1]], 13
; VF1UF4-NEXT: [[TMP6:%.*]] = icmp ule i64 [[TMP2]], 13
; VF1UF4-NEXT: [[TMP7:%.*]] = icmp ule i64 [[TMP3]], 13
; VF1UF4-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; VF1UF4: pred.store.if:
-; VF1UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
+; VF1UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; VF1UF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
; VF1UF4-NEXT: store i64 [[TMP9]], ptr [[B:%.*]], align 8
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]]
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
index c6bcd708c0fce..57e0e9b63b8de 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
@@ -17,17 +17,16 @@ define void @VF1-VPlanExe(ptr %dst) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ule i64 [[TMP0]], 14
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ule i64 [[INDEX]], 14
; CHECK-NEXT: [[TMP5:%.*]] = icmp ule i64 [[TMP1]], 14
; CHECK-NEXT: [[TMP6:%.*]] = icmp ule i64 [[TMP2]], 14
; CHECK-NEXT: [[TMP7:%.*]] = icmp ule i64 [[TMP3]], 14
; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: store i32 0, ptr [[TMP8]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
>From c7f2029b557181c882d37af320c935838327d82d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 30 Mar 2026 11:36:44 +0100
Subject: [PATCH 3/3] !fixup address latest comments, thanks
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 +--
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 39 ++++++++++---------
2 files changed, 22 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e6385899173d8..7eefd77045050 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1638,12 +1638,10 @@ void VPPhi::execute(VPTransformState &State) {
State.TypeAnalysis.inferScalarType(this), 2, getName());
unsigned NumIncoming = getNumIncoming();
// Detect header phis: the parent block dominates its second incoming block
- // (the latch). Non-header phis, e.g. from dissolved replicate regions, don't
- // have this property.
+ // (the latch). Those IR incoming values have not been generated yet and need
+ // to be added after they have been executed.
if (NumIncoming == 2 &&
State.VPDT.dominates(getParent(), getIncomingBlock(1))) {
- // TODO: Fixup all incoming values of header phis once recipes defining them
- // are introduced.
NumIncoming = 1;
}
for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index bc309cf327beb..50b05a9e9c0fd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -673,29 +673,30 @@ static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
VPBlockBase *Entry,
ElementCount VF) {
VPValue *Idx0 = Plan.getZero(IdxTy);
+ VPTypeAnalysis TypeInfo(Plan);
for (VPBlockBase *VPB : vp_depth_first_shallow(Entry)) {
for (VPRecipeBase &OldR : make_early_inc_range(cast<VPBasicBlock>(*VPB))) {
VPBuilder Builder(&OldR);
assert(!match(&OldR, m_ExtractElement(m_VPValue(), m_VPValue())) &&
"must not contain extracts before conversion");
- for (const auto &[I, Op] : enumerate(OldR.operands())) {
- // Skip operands that don't need extraction: values defined in the
- // same block (already scalar), or values that are already single
- // scalars.
- auto *DefR = Op->getDefiningRecipe();
- if ((isa_and_present<VPScalarIVStepsRecipe>(DefR) &&
- DefR->getParent() == VPB) ||
- vputils::isSingleScalar(Op))
- continue;
- // For scalar VF, operands are already scalar; no extraction needed.
- if (VF.isScalar())
- continue;
-
- // Extract lane zero from values defined outside the region.
- VPValue *Extract = Builder.createNaryOp(Instruction::ExtractElement,
- {Op, Idx0}, OldR.getDebugLoc());
- OldR.setOperand(I, Extract);
+ // For scalar VF, operands are already scalar; no extraction needed.
+ if (!VF.isScalar()) {
+ for (const auto &[I, Op] : enumerate(OldR.operands())) {
+ // Skip operands that don't need extraction: values defined in the
+ // same block (already scalar), or values that are already single
+ // scalars.
+ auto *DefR = Op->getDefiningRecipe();
+ if ((isa_and_present<VPScalarIVStepsRecipe>(DefR) &&
+ DefR->getParent() == VPB) ||
+ vputils::isSingleScalar(Op))
+ continue;
+
+ // Extract lane zero from values defined outside the region.
+ VPValue *Extract = Builder.createNaryOp(
+ Instruction::ExtractElement, {Op, Idx0}, OldR.getDebugLoc());
+ OldR.setOperand(I, Extract);
+ }
}
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&OldR)) {
@@ -713,8 +714,8 @@ static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
BranchOnMask->eraseFromParent();
} else if (auto *PredPhi = dyn_cast<VPPredInstPHIRecipe>(&OldR)) {
VPValue *PredOp = PredPhi->getOperand(0);
- VPValue *PoisonVal = Plan.getOrAddLiveIn(
- PoisonValue::get(VPTypeAnalysis(Plan).inferScalarType(PredOp)));
+ Type *PredTy = TypeInfo.inferScalarType(PredOp);
+ VPValue *PoisonVal = Plan.getOrAddLiveIn(PoisonValue::get(PredTy));
VPPhi *NewPhi = Builder.createScalarPhi({PoisonVal, PredOp},
PredPhi->getDebugLoc());
More information about the llvm-commits
mailing list