[llvm] [VPlan] Replace VPRegionBlock with explicit CFG before execute (NFCI). (PR #117506)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sat May 17 14:49:37 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/117506
From c274eea4e968a1c9ac7aa8288c04f8cd1e8ec2ee Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 18 Mar 2025 22:37:13 +0000
Subject: [PATCH 1/5] [VPlan] Replace VPRegionBlock with explicit CFG before
execute (NFCI).
!fixup update more tests.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 31 ++-
llvm/lib/Transforms/Vectorize/VPlan.cpp | 191 ++++++++++--------
llvm/lib/Transforms/Vectorize/VPlan.h | 7 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 16 ++
.../Transforms/Vectorize/VPlanTransforms.h | 5 +-
.../AArch64/epilog-iv-select-cmp.ll | 12 +-
.../AArch64/reduction-recurrence-costs-sve.ll | 10 +-
.../LoopVectorize/AArch64/vplan-printing.ll | 37 ++--
.../RISCV/riscv-vector-reverse.ll | 74 ++++---
.../RISCV/vplan-vp-select-intrinsics.ll | 51 +++--
.../LoopVectorize/vplan-predicate-switch.ll | 113 +++++------
12 files changed, 295 insertions(+), 265 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0b8b0c7dcdfc9..ba9f01e2a330a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2760,6 +2760,15 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}
+static VPBasicBlock *getHeaderForMainVectorLoop(VPlan &Plan,
+ VPDominatorTree &VPDT) {
+ return find_singleton<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry()), [&VPDT](VPBlockBase *VPB, bool) {
+ auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
+ return VPBB && VPBB->isHeader(VPDT) ? VPBB : nullptr;
+ });
+}
+
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Fix widened non-induction PHIs by setting up the PHI operands.
if (EnableVPlanNativePath)
@@ -2778,13 +2787,13 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
PSE.getSE()->forgetLoop(OrigLoop);
PSE.getSE()->forgetBlockAndLoopDispositions();
- // Don't apply optimizations below when no vector region remains, as they all
- // require a vector loop at the moment.
- if (!State.Plan->getVectorLoopRegion())
+ // Don't apply optimizations below when no vector loop remains, as they all
+ // require one at the moment.
+ VPBasicBlock *HeaderVPBB =
+ getHeaderForMainVectorLoop(*State.Plan, State.VPDT);
+ if (!HeaderVPBB)
return;
- VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
- VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
// Remove redundant induction instructions.
@@ -2809,7 +2818,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
}
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
- auto Iter = vp_depth_first_deep(Plan.getEntry());
+ auto Iter = vp_depth_first_shallow(Plan.getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &P : VPBB->phis()) {
VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
@@ -7799,6 +7808,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan, BestVF,
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
+
+ VPBasicBlock *MiddleVPBB =
+ BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
VPlanTransforms::convertToConcreteRecipes(BestVPlan,
*Legal->getWidestInductionType());
@@ -7894,14 +7906,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
- if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
+ VPBasicBlock *HeaderVPBB = getHeaderForMainVectorLoop(BestVPlan, State.VPDT);
+ if (HeaderVPBB) {
MDNode *OrigLoopID = OrigLoop->getLoopID();
std::optional<MDNode *> VectorizedLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});
- VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
if (VectorizedLoopID) {
L->setLoopID(*VectorizedLoopID);
@@ -7947,8 +7959,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
// 4. Adjust branch weight of the branch in the middle block.
- if (BestVPlan.getVectorLoopRegion()) {
- auto *MiddleVPBB = BestVPlan.getMiddleBlock();
+ if (HeaderVPBB) {
auto *MiddleTerm =
cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
if (MiddleTerm->isConditional() &&
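
Note on the header lookup introduced above: once the loop region is dissolved, the main vector loop header can no longer be reached via getVectorLoopRegion() and is instead identified purely structurally. The invariant relied on is that, in the flattened top-level CFG, a header has exactly two predecessors, [preheader, latch], and the latch edge is the only backedge, i.e. the header dominates that predecessor. A minimal sketch of the check, mirroring the helper in the hunk above (names are illustrative, not the literal implementation):

    // A block is a loop header in the flattened CFG iff its second predecessor
    // (the latch) reaches it via a backedge, i.e. the block dominates it.
    static bool isLoopHeader(const VPBasicBlock *VPBB, const VPDominatorTree &VPDT) {
      return VPBB->getPredecessors().size() == 2 &&
             VPDT.dominates(VPBB, VPBB->getPredecessors()[1]);
    }

find_singleton over the shallow traversal then picks out the unique such block at the top level, or returns null when no vector loop remains.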
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 167aff737d3fd..6d35862ef6ad4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -207,6 +207,11 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
return Parent->getEnclosingBlockWithPredecessors();
}
+bool VPBasicBlock::isHeader(const VPDominatorTree &VPDT) const {
+ return getPredecessors().size() == 2 &&
+ VPDT.dominates(this, getPredecessors()[1]);
+}
+
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
while (It != end() && It->isPhi())
@@ -424,7 +429,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
- BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
+ BasicBlock *PredBB = CFG.VPBB2IRBB.lookup(PredVPBB);
+ if (!PredBB)
+ continue;
assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
@@ -432,6 +439,8 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator);
if (isa<UnreachableInst>(PredBBTerminator)) {
+ if (PredVPSuccessors.size() == 2)
+ continue;
assert(PredVPSuccessors.size() == 1 &&
"Predecessor ending w/o branch must have single successor.");
DebugLoc DL = PredBBTerminator->getDebugLoc();
@@ -487,11 +496,25 @@ void VPBasicBlock::execute(VPTransformState *State) {
bool Replica = bool(State->Lane);
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
+ if (isHeader(State->VPDT)) {
+ // Create and register the new vector loop.
+ Loop *PrevParentLoop = State->CurrentParentLoop;
+ State->CurrentParentLoop = State->LI->AllocateLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks
+ // before calling any utilities such as SCEV that require valid LoopInfo.
+ if (PrevParentLoop)
+ PrevParentLoop->addChildLoop(State->CurrentParentLoop);
+ else
+ State->LI->addTopLevelLoop(State->CurrentParentLoop);
+ }
+
auto IsReplicateRegion = [](VPBlockBase *BB) {
auto *R = dyn_cast_or_null<VPRegionBlock>(BB);
- return R && R->isReplicator();
+ assert((!R || R->isReplicator()) &&
+ "only replicate region blocks should remain");
+ return R;
};
-
// 1. Create an IR basic block.
if ((Replica && this == getParent()->getEntry()) ||
IsReplicateRegion(getSingleHierarchicalPredecessor())) {
@@ -514,6 +537,14 @@ void VPBasicBlock::execute(VPTransformState *State) {
// 2. Fill the IR basic block with IR instructions.
executeRecipes(State, NewBB);
+
+ // If this block is a latch, update CurrentParentLoop.
+ if (any_of(getSuccessors(), [State, this](VPBlockBase *Succ) {
+ auto *VPBB = dyn_cast<VPBasicBlock>(Succ);
+ return VPBB && VPBB->isHeader(State->VPDT) &&
+ State->VPDT.dominates(Succ, this);
+ }))
+ State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();
}
VPBasicBlock *VPBasicBlock::clone() {
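
Note on the two hunks above: the LoopInfo bookkeeping that VPRegionBlock::execute used to perform is now driven by block shape alone. A header block allocates and registers a new Loop before its recipes execute, and the latch block (a block with a successor that is a header dominating it) restores the parent loop afterwards. A condensed sketch of that flow, assuming the isHeader helper added above (simplified, not the literal implementation):

    // Executed per VPBasicBlock during VPlan::execute().
    if (VPBB->isHeader(State->VPDT)) {
      Loop *L = State->LI->AllocateLoop();            // new loop for this header
      if (Loop *Parent = State->CurrentParentLoop)
        Parent->addChildLoop(L);
      else
        State->LI->addTopLevelLoop(L);
      State->CurrentParentLoop = L;                   // descend into the new loop
    }
    // ... create/reuse the IR block and execute the recipes ...
    bool IsLatch = any_of(VPBB->getSuccessors(), [&](VPBlockBase *Succ) {
      auto *SuccVPBB = dyn_cast<VPBasicBlock>(Succ);
      return SuccVPBB && SuccVPBB->isHeader(State->VPDT) &&
             State->VPDT.dominates(Succ, VPBB);
    });
    if (IsLatch)                                      // climb back out of the loop
      State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();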
@@ -725,35 +756,13 @@ VPRegionBlock *VPRegionBlock::clone() {
}
void VPRegionBlock::execute(VPTransformState *State) {
- ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
- RPOT(Entry);
-
- if (!isReplicator()) {
- // Create and register the new vector loop.
- Loop *PrevParentLoop = State->CurrentParentLoop;
- State->CurrentParentLoop = State->LI->AllocateLoop();
-
- // Insert the new loop into the loop nest and register the new basic blocks
- // before calling any utilities such as SCEV that require valid LoopInfo.
- if (PrevParentLoop)
- PrevParentLoop->addChildLoop(State->CurrentParentLoop);
- else
- State->LI->addTopLevelLoop(State->CurrentParentLoop);
-
- // Visit the VPBlocks connected to "this", starting from it.
- for (VPBlockBase *Block : RPOT) {
- LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
- Block->execute(State);
- }
-
- State->CurrentParentLoop = PrevParentLoop;
- return;
- }
-
+ assert(isReplicator() &&
+ "Loop regions should have been lowered to plain CFG");
assert(!State->Lane && "Replicating a Region with non-null instance.");
-
- // Enter replicating mode.
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
+
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ Entry);
State->Lane = VPLane(0);
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
++Lane) {
@@ -847,6 +856,22 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPRegionBlock::removeRegion() {
+ auto *Header = cast<VPBasicBlock>(getEntry());
+ VPBlockBase *Preheader = getSinglePredecessor();
+ auto *Exiting = cast<VPBasicBlock>(getExiting());
+
+ VPBlockBase *Middle = getSingleSuccessor();
+ VPBlockUtils::disconnectBlocks(Preheader, this);
+ VPBlockUtils::disconnectBlocks(this, Middle);
+
+ for (VPBlockBase *VPB : vp_depth_first_shallow(Entry))
+ VPB->setParent(nullptr);
+
+ VPBlockUtils::connectBlocks(Preheader, Header);
+ VPBlockUtils::connectBlocks(Exiting, Middle);
+}
+
VPlan::VPlan(Loop *L) {
setEntry(createVPIRBasicBlock(L->getLoopPreheader()));
ScalarHeader = createVPIRBasicBlock(L->getHeader());
@@ -956,57 +981,57 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : RPOT)
Block->execute(State);
- State->CFG.DTU.flush();
-
- auto *LoopRegion = getVectorLoopRegion();
- if (!LoopRegion)
- return;
-
- VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
- BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
-
// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
- VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
- for (VPRecipeBase &R : Header->phis()) {
- // Skip phi-like recipes that generate their backedege values themselves.
- if (isa<VPWidenPHIRecipe>(&R))
+ for (VPBasicBlock *Header :
+ VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(Entry))) {
+ if (!Header->isHeader(State->VPDT))
continue;
+ for (VPRecipeBase &R : Header->phis()) {
+ if (isa<VPWidenPHIRecipe>(&R))
+ continue;
- if (isa<VPWidenInductionRecipe>(&R)) {
- PHINode *Phi = nullptr;
- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
- Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
- } else {
- auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
- assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
- "recipe generating only scalars should have been replaced");
- auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
- Phi = cast<PHINode>(GEP->getPointerOperand());
+ auto *LatchVPBB = cast<VPBasicBlock>(Header->getPredecessors()[1]);
+ BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+
+ if (isa<VPWidenInductionRecipe>(&R)) {
+ PHINode *Phi = nullptr;
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
+ } else {
+ auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+ assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+ "recipe generating only scalars should have been replaced");
+ auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+ Phi = cast<PHINode>(GEP->getPointerOperand());
+ }
+
+ Phi->setIncomingBlock(1, VectorLatchBB);
+
+ // Move the last step to the end of the latch block. This ensures
+ // consistent placement of all induction updates.
+ Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+ Inc->moveBefore(
+ std::prev(VectorLatchBB->getTerminator()->getIterator()));
+
+ // Use the steps for the last part as backedge value for the induction.
+ if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+ Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+ continue;
}
- Phi->setIncomingBlock(1, VectorLatchBB);
-
- // Move the last step to the end of the latch block. This ensures
- // consistent placement of all induction updates.
- Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
- Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator()));
-
- // Use the steps for the last part as backedge value for the induction.
- if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
- Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
- continue;
+ auto *PhiR = cast<VPSingleDefRecipe>(&R);
+ // VPInstructions currently model scalar Phis only.
+ bool NeedsScalar = isa<VPInstruction>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+
+ Value *Phi = State->get(PhiR, NeedsScalar);
+ // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does
+ // not.
+ Value *Val = State->get(PhiR->getOperand(1), NeedsScalar);
+ cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
-
- auto *PhiR = cast<VPSingleDefRecipe>(&R);
- // VPInstructions currently model scalar Phis only.
- bool NeedsScalar = isa<VPInstruction>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
- Value *Phi = State->get(PhiR, NeedsScalar);
- // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does not.
- Value *Val = State->get(PhiR->getOperand(1), NeedsScalar);
- cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
}
@@ -1365,16 +1390,16 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
#endif
-/// Returns true if there is a vector loop region and \p VPV is defined in a
-/// loop region.
-static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
- const VPRecipeBase *DefR = VPV->getDefiningRecipe();
- return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
- DefR->getParent()->getEnclosingLoopRegion());
-}
-
bool VPValue::isDefinedOutsideLoopRegions() const {
- return !isDefinedInsideLoopRegions(this);
+ auto *DefR = getDefiningRecipe();
+ if (!DefR)
+ return true;
+
+ const VPBasicBlock *DefVPBB = DefR->getParent();
+ auto *Plan = DefVPBB->getPlan();
+ if (Plan->getVectorLoopRegion())
+ return !DefR->getParent()->getEnclosingLoopRegion();
+ return DefVPBB == Plan->getEntry();
}
void VPValue::replaceAllUsesWith(VPValue *New) {
replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2c4cac7655ec9..3931583233bb6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3415,6 +3415,9 @@ class VPBasicBlock : public VPBlockBase {
/// second predecessor is the exiting block of the region.
const VPBasicBlock *getCFGPredecessor(unsigned Idx) const;
+ /// Returns true if the block is a loop header in a plain-CFG VPlan.
+ bool isHeader(const VPDominatorTree &VPDT) const;
+
protected:
/// Execute the recipes in the IR basic block \p BB.
void executeRecipes(VPTransformState *State, BasicBlock *BB);
@@ -3566,6 +3569,10 @@ class VPRegionBlock : public VPBlockBase {
/// Clone all blocks in the single-entry single-exit region of the block and
/// their recipes without updating the operands of the cloned recipes.
VPRegionBlock *clone() override;
+
+ /// Remove the current region from its VPlan, connecting its predecessor to
+ /// its entry and exiting block to its successor.
+ void removeRegion();
};
/// VPlan models a candidate for vectorization, encoding various decisions take
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6a4ffac200b1c..58da8610a354b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -583,11 +583,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
CondBr->setSuccessor(0, nullptr);
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
- if (!getParent()->isExiting())
+ VPBasicBlock *Header = cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
+ if (!State.CFG.VPBB2IRBB.contains(Header))
return CondBr;
- VPRegionBlock *ParentRegion = getParent()->getParent();
- VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
return CondBr;
}
@@ -598,9 +597,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Cond = Builder.CreateICmpEQ(IV, TC);
// Now create the branch.
- auto *Plan = getParent()->getPlan();
- VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
- VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
+ VPBasicBlock *Header = cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
// Replace the temporary unreachable terminator with a new conditional
// branch, hooking it up to backward destination (the header) now and to the
@@ -1124,10 +1121,6 @@ void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
void VPPhi::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
- assert(getParent() ==
- getParent()->getPlan()->getVectorLoopRegion()->getEntry() &&
- "VPInstructions with PHI opcodes must be used for header phis only "
- "at the moment");
BasicBlock *VectorPH = State.CFG.VPBB2IRBB.at(getIncomingBlock(0));
Value *Start = State.get(getIncomingValue(0), VPLane(0));
PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, getName());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b10b47cc1282a..e8481d0bf0c27 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2394,10 +2394,26 @@ void VPlanTransforms::createInterleaveGroups(
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
Type &CanonicalIVTy) {
+ // Replace loop regions with explicit CFG.
+ SmallVector<VPRegionBlock *> LoopRegions;
+ for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
+ vp_depth_first_deep(Plan.getEntry()))) {
+ if (!R->isReplicator())
+ LoopRegions.push_back(R);
+ }
+ for (VPRegionBlock *R : LoopRegions) {
+ VPBlockBase *Header = R->getEntry();
+ VPBlockBase *Latch = R->getExiting();
+ R->removeRegion();
+ // Add explicit backedge.
+ VPBlockUtils::connectBlocks(Latch, Header);
+ }
+
using namespace llvm::VPlanPatternMatch;
VPTypeAnalysis TypeInfo(&CanonicalIVTy);
SmallVector<VPRecipeBase *> ToRemove;
+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
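
To make the effect of the dissolution step above concrete, this is roughly how the printed VPlan changes (taken from the test updates further down; block names vary per test):

    Before:
      vector.ph
      Successor(s): vector loop

      <x1> vector loop: {
        vector.body:
          ...recipes...
          EMIT branch-on-count ...
        No successors
      }
      Successor(s): middle.block

    After:
      vector.ph
      Successor(s): vector.body

      vector.body:
        ...recipes...
        EMIT branch-on-count ...
      Successor(s): middle.block, vector.body

The region wrapper disappears, the backedge becomes an explicit vector.body -> vector.body edge added via connectBlocks(Latch, Header), and the latch now lists both the middle block and the header as successors.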
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index cb127d37661c7..4b80875b79f13 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -184,8 +184,9 @@ struct VPlanTransforms {
VPRecipeBuilder &RecipeBuilder,
VFRange &Range);
- /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
- /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
+ /// Lower abstract recipes to concrete ones that can be codegen'd, and replace
+ /// loop regions with explicit CFG. Use \p CanonicalIVTy as type for all
+ /// un-typed live-ins in VPTypeAnalysis.
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
/// Perform instcombine-like simplifications on recipes in \p Plan. Use \p
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index c0806ea16a5fc..d4494089f7083 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -153,11 +153,10 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4)
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP4]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP5]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP6]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]]
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]]
+; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[TMP1]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4)
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -196,8 +195,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]]
+; CHECK-NEXT: [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]]
; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4
; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], splat (i32 4)
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC8]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 969bb413f9c50..c2fe37ad214c6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -74,10 +74,7 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = and <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i32> [[TMP13]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[TMP14]] to <vscale x 4 x i64>
-; VSCALEFORTUNING2-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i64> [[TMP15]], i32 0
-; VSCALEFORTUNING2-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[TMP16]]
-; VSCALEFORTUNING2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP17]], i64 0
-; VSCALEFORTUNING2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[DOTSPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; VSCALEFORTUNING2-NEXT: [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP15]]
; VSCALEFORTUNING2-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
; VSCALEFORTUNING2-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 4
; VSCALEFORTUNING2-NEXT: [[TMP20:%.*]] = sub i32 [[TMP19]], 1
@@ -210,10 +207,7 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; PRED-NEXT: [[TMP17:%.*]] = and <vscale x 4 x i32> [[TMP16]], splat (i32 1)
; PRED-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i32> [[TMP17]], splat (i32 1)
; PRED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i32> [[TMP18]] to <vscale x 4 x i64>
-; PRED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i64> [[TMP19]], i32 0
-; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[TMP20]]
-; PRED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP21]], i64 0
-; PRED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[DOTSPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[DOTSPLAT:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP19]]
; PRED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
; PRED-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 4
; PRED-NEXT: [[TMP24:%.*]] = sub i32 [[TMP23]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 567aa63483771..2e9d90f762ccd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -83,27 +83,24 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.ph>:
-; CHECK-NEXT: Successor(s): vector loop
+; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> (VF scaled by 1/4)
-; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
-; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
-; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
-; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
-; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
-; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
-; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
-; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
-; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
-; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
-; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> (VF scaled by 1/4)
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
+; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
+; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
+; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
+; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
+; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
+; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
+; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
+; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
+; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
+; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 9e77a0ca8bcc9..0d77dfc50dd70 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -193,26 +193,23 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %18 = mul i64 %17, 4
; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
-; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]>
@@ -444,26 +441,23 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: IR %18 = mul i64 %17, 4
; CHECK-NEXT: vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT: vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
-; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
-; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
+; CHECK-NEXT: vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
+; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
+; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index b2ec86ea3ec53..86647b1386ec5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -28,33 +28,30 @@
; IF-EVL-NEXT: IR %n.vec = sub i64 %n.rnd.up, %n.mod.vf
; IF-EVL-NEXT: IR %7 = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: IR %8 = mul i64 %7, 4
- ; IF-EVL-NEXT: Successor(s): vector loop
-
- ; IF-EVL: <x1> vector loop: {
- ; IF-EVL-NEXT: vector.body:
- ; IF-EVL-NEXT: EMIT vp<[[IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[IV_NEXT_EXIT:%.+]]>, vector.body ]
- ; IF-EVL-NEXT: EMIT vp<[[EVL_PHI:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[IV_NEX:%.+]]>, vector.body ]
- ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
- ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
- ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[EVL_PHI]]>
- ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
- ; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
- ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[EVL_PHI]]>
- ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
- ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
- ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
- ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub ir<0>, ir<[[LD2]]>
- ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<[[LD2]]>, ir<[[SUB]]>, vp<[[EVL]]>)
- ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add vp<[[SELECT]]>, ir<[[LD1]]>
- ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]>
- ; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
- ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
- ; IF-EVL-NEXT: EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
- ; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
- ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT]]> = add vp<[[IV]]>, ir<[[VFUF]]>
- ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, ir<[[VTC]]>
- ; IF-EVL-NEXT: No successors
- ; IF-EVL-NEXT: }
+ ; IF-EVL-NEXT: Successor(s): vector.body
+ ; IF-EVL-EMPTY:
+ ; IF-EVL-NEXT: vector.body:
+ ; IF-EVL-NEXT: EMIT vp<[[IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[IV_NEXT_EXIT:%.+]]>, vector.body ]
+ ; IF-EVL-NEXT: EMIT vp<[[EVL_PHI:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[IV_NEX:%.+]]>, vector.body ]
+ ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+ ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+ ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[EVL_PHI]]>
+ ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+ ; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+ ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[EVL_PHI]]>
+ ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+ ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+ ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
+ ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub ir<0>, ir<[[LD2]]>
+ ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<[[LD2]]>, ir<[[SUB]]>, vp<[[EVL]]>)
+ ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add vp<[[SELECT]]>, ir<[[LD1]]>
+ ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]>
+ ; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]>
+ ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
+ ; IF-EVL-NEXT: EMIT vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+ ; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+ ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT]]> = add vp<[[IV]]>, ir<[[VFUF]]>
+ ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, ir<[[VTC]]>
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index 61a5bd69b7ba3..59e2664cc1402 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -15,75 +15,72 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
; CHECK-NEXT: IR %n.mod.vf = urem i64 %0, 2
; CHECK-NEXT: IR %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<%start> + ir<%n.vec> * ir<1>
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, default.2 ]
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<2>
-; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]>
-; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]>
-; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12>
-; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13>
-; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]>
-; CHECK-NEXT: EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]>
-; CHECK-NEXT: Successor(s): pred.store
-; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.store: {
-; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[C2]]>
-; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: REPLICATE store ir<0>, vp<[[PTR]]>
-; CHECK-NEXT: Successor(s): pred.store.continue
-; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.continue:
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): if.then.2.0
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, default.2 ]
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, ir<2>
+; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]>
+; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12>
+; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13>
+; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]>
+; CHECK-NEXT: EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[C2]]>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: REPLICATE store ir<0>, vp<[[PTR]]>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): if.then.2.0
; CHECK-EMPTY:
-; CHECK-NEXT: if.then.2.0:
-; CHECK-NEXT: Successor(s): pred.store
+; CHECK-NEXT: if.then.2.0:
+; CHECK-NEXT: Successor(s): pred.store
; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.store: {
-; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[C1]]>
-; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[C1]]>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
; CHECK-EMPTY:
; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: REPLICATE store ir<42>, vp<[[PTR]]>
-; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-NEXT: REPLICATE store ir<42>, vp<[[PTR]]>
+; CHECK-NEXT: Successor(s): pred.store.continue
; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.continue:
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): if.then.1.1
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): if.then.1.1
; CHECK-EMPTY:
-; CHECK-NEXT: if.then.1.1:
-; CHECK-NEXT: Successor(s): pred.store
+; CHECK-NEXT: if.then.1.1:
+; CHECK-NEXT: Successor(s): pred.store
; CHECK-EMPTY:
-; CHECK-NEXT: <xVFxUF> pred.store: {
-; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[DEFAULT_MASK]]>
-; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[DEFAULT_MASK]]>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: REPLICATE store ir<2>, vp<[[PTR]]>
-; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: REPLICATE store ir<2>, vp<[[PTR]]>
+; CHECK-NEXT: Successor(s): pred.store.continue
; CHECK-EMPTY:
-; CHECK-NEXT: pred.store.continue:
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): default.2
-; CHECK-EMPTY:
-; CHECK-NEXT: default.2:
-; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
-; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VTC]]>
+; CHECK-NEXT: pred.store.continue:
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.block
+; CHECK-NEXT: Successor(s): default.2
+; CHECK-EMPTY:
+; CHECK-NEXT: default.2:
+; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VTC]]>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<%0>, ir<[[VTC]]>
From e3d37546e2ee2850ff140f5ea523cd44ce5ad078 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 12 May 2025 12:32:25 +0100
Subject: [PATCH 2/5] !fixup address latest comments, thanks
---
.../Transforms/Vectorize/LoopVectorize.cpp | 17 +--
llvm/lib/Transforms/Vectorize/VPlan.cpp | 136 +++++++++---------
llvm/lib/Transforms/Vectorize/VPlan.h | 16 +--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 29 ++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 19 +--
.../Transforms/Vectorize/VPlanTransforms.h | 8 +-
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 2 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 12 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.h | 5 +
llvm/lib/Transforms/Vectorize/VPlanValue.h | 4 +-
10 files changed, 127 insertions(+), 121 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index be8decbfaf7c9..f6e3f3a36deb1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2760,15 +2760,6 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}
-static VPBasicBlock *getHeaderForMainVectorLoop(VPlan &Plan,
- VPDominatorTree &VPDT) {
- return find_singleton<VPBasicBlock>(
- vp_depth_first_shallow(Plan.getEntry()), [&VPDT](VPBlockBase *VPB, bool) {
- auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
- return VPBB && VPBB->isHeader(VPDT) ? VPBB : nullptr;
- });
-}
-
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Fix widened non-induction PHIs by setting up the PHI operands.
if (EnableVPlanNativePath)
@@ -2787,10 +2778,10 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
PSE.getSE()->forgetLoop(OrigLoop);
PSE.getSE()->forgetBlockAndLoopDispositions();
- // Don't apply optimizations below when no vector loop remains, as they all
+ // Don't apply optimizations below when no (vector) loop remains, as they all
// require one at the moment.
VPBasicBlock *HeaderVPBB =
- getHeaderForMainVectorLoop(*State.Plan, State.VPDT);
+ vputils::getTopLevelVectorLoopHeader(*State.Plan, State.VPDT);
if (!HeaderVPBB)
return;
@@ -7811,6 +7802,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPBasicBlock *MiddleVPBB =
BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
+ VPlanTransforms::disolveLoopRegions(BestVPlan);
VPlanTransforms::convertToConcreteRecipes(BestVPlan,
*Legal->getWidestInductionType());
@@ -7906,7 +7898,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
- VPBasicBlock *HeaderVPBB = getHeaderForMainVectorLoop(BestVPlan, State.VPDT);
+ VPBasicBlock *HeaderVPBB =
+ vputils::getTopLevelVectorLoopHeader(BestVPlan, State.VPDT);
if (HeaderVPBB) {
MDNode *OrigLoopID = OrigLoop->getLoopID();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 6d35862ef6ad4..8aa769fdbad63 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -207,9 +207,17 @@ VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
return Parent->getEnclosingBlockWithPredecessors();
}
-bool VPBasicBlock::isHeader(const VPDominatorTree &VPDT) const {
- return getPredecessors().size() == 2 &&
- VPDT.dominates(this, getPredecessors()[1]);
+bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
+ const VPDominatorTree &VPDT) {
+ auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
+ if (!VPBB)
+ return false;
+ if (auto *R = VPBB->getParent())
+ return !R->isReplicator() && VPBB->getNumPredecessors() == 0;
+
+ assert(!VPB->getParent() && "checking blocks in regions not implemented yet");
+ return VPB->getPredecessors().size() == 2 &&
+ VPDT.dominates(VPB, VPB->getPredecessors()[1]);
}
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
@@ -425,22 +433,23 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
if (ParentLoop && !State.LI->getLoopFor(NewBB))
ParentLoop->addBasicBlockToLoop(NewBB, *State.LI);
+ auto Preds = to_vector(getHierarchicalPredecessors());
+ if (VPBlockUtils::isHeader(this, State.VPDT)) {
+ // There's no block yet for the latch, don't try to connect it yet.
+ Preds = {Preds[0]};
+ }
+
// Hook up the new basic block to its predecessors.
- for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
+ for (VPBlockBase *PredVPBlock : Preds) {
VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
BasicBlock *PredBB = CFG.VPBB2IRBB.lookup(PredVPBB);
- if (!PredBB)
- continue;
-
assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator);
if (isa<UnreachableInst>(PredBBTerminator)) {
- if (PredVPSuccessors.size() == 2)
- continue;
assert(PredVPSuccessors.size() == 1 &&
"Predecessor ending w/o branch must have single successor.");
DebugLoc DL = PredBBTerminator->getDebugLoc();
@@ -496,7 +505,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
bool Replica = bool(State->Lane);
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
- if (isHeader(State->VPDT)) {
+ if (VPBlockUtils::isHeader(this, State->VPDT)) {
// Create and register the new vector loop.
Loop *PrevParentLoop = State->CurrentParentLoop;
State->CurrentParentLoop = State->LI->AllocateLoop();
@@ -539,11 +548,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
executeRecipes(State, NewBB);
// If this block is a latch, update CurrentParentLoop.
- if (any_of(getSuccessors(), [State, this](VPBlockBase *Succ) {
- auto *VPBB = dyn_cast<VPBasicBlock>(Succ);
- return VPBB && VPBB->isHeader(State->VPDT) &&
- State->VPDT.dominates(Succ, this);
- }))
+ if (getNumSuccessors() == 2 &&
+ VPBlockUtils::isHeader(getSuccessors()[1], State->VPDT))
State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();
}
@@ -866,10 +872,11 @@ void VPRegionBlock::removeRegion() {
VPBlockUtils::disconnectBlocks(this, Middle);
for (VPBlockBase *VPB : vp_depth_first_shallow(Entry))
- VPB->setParent(nullptr);
+ VPB->setParent(getParent());
VPBlockUtils::connectBlocks(Preheader, Header);
VPBlockUtils::connectBlocks(Exiting, Middle);
+ VPBlockUtils::connectBlocks(Exiting, Header);
}
VPlan::VPlan(Loop *L) {
@@ -981,57 +988,57 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : RPOT)
Block->execute(State);
+ VPBasicBlock *Header =
+ vputils::getTopLevelVectorLoopHeader(*this, State->VPDT);
+ if (!Header)
+ return;
+
+ auto *LatchVPBB = cast<VPBasicBlock>(Header->getPredecessors()[1]);
+ BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+
// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
- for (VPBasicBlock *Header :
- VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(Entry))) {
- if (!Header->isHeader(State->VPDT))
+ for (VPRecipeBase &R : Header->phis()) {
+ // Skip phi-like recipes that generate their backedege values themselves.
+ if (isa<VPWidenPHIRecipe>(&R))
continue;
- for (VPRecipeBase &R : Header->phis()) {
- if (isa<VPWidenPHIRecipe>(&R))
- continue;
- auto *LatchVPBB = cast<VPBasicBlock>(Header->getPredecessors()[1]);
- BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
-
- if (isa<VPWidenInductionRecipe>(&R)) {
- PHINode *Phi = nullptr;
- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
- Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
- } else {
- auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
- assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
- "recipe generating only scalars should have been replaced");
- auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
- Phi = cast<PHINode>(GEP->getPointerOperand());
- }
-
- Phi->setIncomingBlock(1, VectorLatchBB);
-
- // Move the last step to the end of the latch block. This ensures
- // consistent placement of all induction updates.
- Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
- Inc->moveBefore(
- std::prev(VectorLatchBB->getTerminator()->getIterator()));
-
- // Use the steps for the last part as backedge value for the induction.
- if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
- Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
- continue;
+ if (isa<VPWidenInductionRecipe>(&R)) {
+ PHINode *Phi = nullptr;
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
+ } else {
+ auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
+ assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+ "recipe generating only scalars should have been replaced");
+ auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+ Phi = cast<PHINode>(GEP->getPointerOperand());
}
- auto *PhiR = cast<VPSingleDefRecipe>(&R);
- // VPInstructions currently model scalar Phis only.
- bool NeedsScalar = isa<VPInstruction>(PhiR) ||
- (isa<VPReductionPHIRecipe>(PhiR) &&
- cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
-
- Value *Phi = State->get(PhiR, NeedsScalar);
- // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does
- // not.
- Value *Val = State->get(PhiR->getOperand(1), NeedsScalar);
- cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
+ Phi->setIncomingBlock(1, VectorLatchBB);
+
+ // Move the last step to the end of the latch block. This ensures
+ // consistent placement of all induction updates.
+ Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
+ Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator()));
+
+ // Use the steps for the last part as backedge value for the induction.
+ if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
+ Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
+ continue;
}
+
+ auto *PhiR = cast<VPSingleDefRecipe>(&R);
+ // VPInstructions currently model scalar Phis only.
+ bool NeedsScalar = isa<VPInstruction>(PhiR) ||
+ (isa<VPReductionPHIRecipe>(PhiR) &&
+ cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
+
+ Value *Phi = State->get(PhiR, NeedsScalar);
+ // VPHeaderPHIRecipe supports getBackedgeValue() but VPInstruction does
+ // not.
+ Value *Val = State->get(PhiR->getOperand(1), NeedsScalar);
+ cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
}
@@ -1390,17 +1397,18 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
#endif
-bool VPValue::isDefinedOutsideLoopRegions() const {
+bool VPValue::isDefinedOutsideLoop() const {
auto *DefR = getDefiningRecipe();
if (!DefR)
return true;
+ // For non-live-ins, check if it is in a region only if the top-level loop
+ // region still exists.
const VPBasicBlock *DefVPBB = DefR->getParent();
auto *Plan = DefVPBB->getPlan();
- if (Plan->getVectorLoopRegion())
- return !DefR->getParent()->getEnclosingLoopRegion();
- return DefVPBB == Plan->getEntry();
+ return Plan->getVectorLoopRegion() && !DefVPBB->getEnclosingLoopRegion();
}
+
void VPValue::replaceAllUsesWith(VPValue *New) {
replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
}
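
A quick worked example of the renamed predicate against the plans printed in the tests (assuming that structure; values are named after the predicate-switch test):

    // isDefinedOutsideLoop() per the new definition:
    //   live-in ir<%start>                            -> no defining recipe           -> true
    //   vp<[[END]]> DERIVED-IV emitted in vector.ph   -> region intact, def outside   -> true
    //   WIDEN ir<%l> inside the vector loop region    -> enclosing loop region        -> false
    //   any recipe once the regions are dissolved     -> getVectorLoopRegion() null   -> false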
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3931583233bb6..c6608760349e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1589,9 +1589,7 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
return getOperand(0);
}
- bool isInvariantCond() const {
- return getCond()->isDefinedOutsideLoopRegions();
- }
+ bool isInvariantCond() const { return getCond()->isDefinedOutsideLoop(); }
/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -1604,17 +1602,16 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
/// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
bool isPointerLoopInvariant() const {
- return getOperand(0)->isDefinedOutsideLoopRegions();
+ return getOperand(0)->isDefinedOutsideLoop();
}
bool isIndexLoopInvariant(unsigned I) const {
- return getOperand(I + 1)->isDefinedOutsideLoopRegions();
+ return getOperand(I + 1)->isDefinedOutsideLoop();
}
bool areAllOperandsInvariant() const {
- return all_of(operands(), [](VPValue *Op) {
- return Op->isDefinedOutsideLoopRegions();
- });
+ return all_of(operands(),
+ [](VPValue *Op) { return Op->isDefinedOutsideLoop(); });
}
public:
@@ -3415,9 +3412,6 @@ class VPBasicBlock : public VPBlockBase {
/// second predecessor is the exiting block of the region.
const VPBasicBlock *getCFGPredecessor(unsigned Idx) const;
- /// Returns true if the block is a loop header in a plain-CFG VPlan.
- bool isHeader(const VPDominatorTree &VPDT) const;
-
protected:
/// Execute the recipes in the IR basic block \p BB.
void executeRecipes(VPTransformState *State, BasicBlock *BB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d90f5c2f19762..52ce8e72bf88b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -576,8 +576,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
case VPInstruction::BranchOnCond: {
Value *Cond = State.get(getOperand(0), VPLane(0));
// Replace the temporary unreachable terminator with a new conditional
- // branch, hooking it up to backward destination for exiting blocks now and
- // to forward destination(s) later when they are created.
+ // branch, hooking it up to backward destination (header) for latch blocks
+ // now to forward destination(s) later when they are created.
BranchInst *CondBr =
Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
CondBr->setSuccessor(0, nullptr);
@@ -600,10 +600,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
VPBasicBlock *Header = cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
// Replace the temporary unreachable terminator with a new conditional
- // branch, hooking it up to backward destination (the header) now and to the
- // forward destination (the exit/middle block) later when it is created.
- // Note that CreateCondBr expects a valid BB as first argument, so we need
- // to set it to nullptr later.
+ // branch, hooking it up to backward destination (the header) for latch
+ // blocks now forward destination (the exit/middle block) later when it is
+ // created. Note that CreateCondBr expects a valid BB as first argument, so
+ // we need to set it to nullptr later.
BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
State.CFG.VPBB2IRBB[Header]);
CondBr->setSuccessor(0, nullptr);
@@ -1560,7 +1560,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
SelectInst *SI = cast<SelectInst>(getUnderlyingValue());
- bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
+ bool ScalarCond = getOperand(0)->isDefinedOutsideLoop();
Type *ScalarTy = Ctx.Types.inferScalarType(this);
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
@@ -1784,7 +1784,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
- getOperand(1)->isDefinedOutsideLoopRegions())
+ getOperand(1)->isDefinedOutsideLoop())
RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
@@ -2634,13 +2634,12 @@ static void scalarizeInstruction(const Instruction *Instr,
if (auto *II = dyn_cast<AssumeInst>(Cloned))
State.AC->registerAssumption(II);
- assert(
- (RepRecipe->getParent()->getParent() ||
- !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
- all_of(RepRecipe->operands(),
- [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
- "Expected a recipe is either within a region or all of its operands "
- "are defined outside the vectorized region.");
+ assert((RepRecipe->getParent()->getParent() ||
+ !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
+ all_of(RepRecipe->operands(),
+ [](VPValue *Op) { return Op->isDefinedOutsideLoop(); })) &&
+ "Expected a recipe is either within a region or all of its operands "
+ "are defined outside the vectorized region.");
}
void VPReplicateRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e8481d0bf0c27..fb6e2d26e35c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1623,9 +1623,8 @@ static void licm(VPlan &Plan) {
// TODO: Relax checks in the future, e.g. we could also hoist reads, if
// their memory location is not modified in the vector loop.
if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
- any_of(R.operands(), [](VPValue *Op) {
- return !Op->isDefinedOutsideLoopRegions();
- }))
+ any_of(R.operands(),
+ [](VPValue *Op) { return !Op->isDefinedOutsideLoop(); }))
continue;
R.moveBefore(*Preheader, Preheader->end());
}
@@ -2392,8 +2391,7 @@ void VPlanTransforms::createInterleaveGroups(
}
}
-void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
- Type &CanonicalIVTy) {
+void VPlanTransforms::disolveLoopRegions(VPlan &Plan) {
// Replace loop regions with explicity CFG.
SmallVector<VPRegionBlock *> LoopRegions;
for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
@@ -2401,19 +2399,16 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
if (!R->isReplicator())
LoopRegions.push_back(R);
}
- for (VPRegionBlock *R : LoopRegions) {
- VPBlockBase *Header = R->getEntry();
- VPBlockBase *Latch = R->getExiting();
+ for (VPRegionBlock *R : LoopRegions)
R->removeRegion();
- // Add explicit backedge.
- VPBlockUtils::connectBlocks(Latch, Header);
- }
+}
+void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
+ Type &CanonicalIVTy) {
using namespace llvm::VPlanPatternMatch;
VPTypeAnalysis TypeInfo(&CanonicalIVTy);
SmallVector<VPRecipeBase *> ToRemove;
-
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 4b80875b79f13..6f6946e39ce1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -184,9 +184,11 @@ struct VPlanTransforms {
VPRecipeBuilder &RecipeBuilder,
VFRange &Range);
- /// Lower abstract recipes to concrete ones, that can be codegen'd and replace
- /// loop regions with explicit CFG. Use \p CanonicalIVTy as type for all
- /// un-typed live-ins in VPTypeAnalysis.
+ /// Replace loop regions with explicit CFG.
+ static void disolveLoopRegions(VPlan &Plan);
+
+ /// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
+ /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
/// Perform instcombine-like simplifications on recipes in \p Plan. Use \p
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index ce83c276297c0..447648018e514 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -246,7 +246,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
}
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
- RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
+ RepR->getOperand(1)->isDefinedOutsideLoop()) {
// Stores to an invariant address only need to store the last part.
remapOperands(&R, UF - 1);
return;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 2db4957409c8d..fda0c70aaf5c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "VPlanUtils.h"
+#include "VPlanCFG.h"
#include "VPlanPatternMatch.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -87,7 +88,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
return true;
VPRecipeBase *R = V->getDefiningRecipe();
- if (R && V->isDefinedOutsideLoopRegions()) {
+ if (R && V->isDefinedOutsideLoop()) {
if (match(V->getDefiningRecipe(),
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
m_VPValue())))
@@ -124,3 +125,12 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
return false;
});
}
+
+VPBasicBlock *vputils::getTopLevelVectorLoopHeader(VPlan &Plan,
+ VPDominatorTree &VPDT) {
+ auto DepthFirst = vp_depth_first_shallow(Plan.getEntry());
+ auto I = find_if(DepthFirst, [&VPDT](VPBlockBase *VPB) {
+ return VPBlockUtils::isHeader(VPB, VPDT);
+ });
+ return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 67329a6d6953c..f2febb2282b4f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -94,6 +94,8 @@ bool isHeaderMask(const VPValue *V, VPlan &Plan);
/// VPDerivedIV or VPCanonicalIVPHI).
bool isUniformAcrossVFsAndUFs(VPValue *V);
+/// Returns the header block of the top-level vector loop, if one exists.
+VPBasicBlock *getTopLevelVectorLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
} // namespace vputils
//===----------------------------------------------------------------------===//
@@ -240,6 +242,9 @@ class VPBlockUtils {
VPBlockUtils::connectBlocks(From, BlockPtr, -1, SuccIdx);
VPBlockUtils::connectBlocks(BlockPtr, To, PredIx, -1);
}
+
+ /// Returns true if \p VPB is a loop header.
+ static bool isHeader(const VPBlockBase *VPBB, const VPDominatorTree &VPDT);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 638156eab7a84..3e3ce920170e0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -181,8 +181,8 @@ class VPValue {
return getUnderlyingValue();
}
- /// Returns true if the VPValue is defined outside any loop region.
- bool isDefinedOutsideLoopRegions() const;
+ /// Returns true if the VPValue is defined outside any loop.
+ bool isDefinedOutsideLoop() const;
// Set \p Val as the underlying Value of this VPValue.
void setUnderlyingValue(Value *Val) {
>From 3a064e7c13d0725d77c312fa60d70e86fb32a033 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 14 May 2025 10:17:52 +0100
Subject: [PATCH 3/5] !fixup address latest comments, thanks!
---
.../Transforms/Vectorize/LoopVectorize.cpp | 8 ++++---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 23 +++++++++++++------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +++----
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 3 +--
llvm/lib/Transforms/Vectorize/VPlanUtils.h | 13 +++++++----
5 files changed, 35 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 31724f0a80106..505f1aad46cff 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2781,7 +2781,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Don't apply optimizations below when no (vector) loop remains, as they all
// require one at the moment.
VPBasicBlock *HeaderVPBB =
- vputils::getTopLevelVectorLoopHeader(*State.Plan, State.VPDT);
+ vputils::getFirstLoopHeader(*State.Plan, State.VPDT);
if (!HeaderVPBB)
return;
@@ -7801,6 +7801,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
VPlanTransforms::removeDeadRecipes(BestVPlan);
+  // Retrieve and store the middle block before dissolving regions. Regions
+  // are dissolved after optimizing for VF and UF, which first removes
+  // unneeded loop regions entirely.
VPBasicBlock *MiddleVPBB =
BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
VPlanTransforms::disolveLoopRegions(BestVPlan);
@@ -7899,8 +7902,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
- VPBasicBlock *HeaderVPBB =
- vputils::getTopLevelVectorLoopHeader(BestVPlan, State.VPDT);
+ VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(BestVPlan, State.VPDT);
if (HeaderVPBB) {
MDNode *OrigLoopID = OrigLoop->getLoopID();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8aa769fdbad63..8ac819a61ee26 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -212,14 +212,24 @@ bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
if (!VPBB)
return false;
+
+ // If VPBB is in a region R, VPBB is a loop header if R is a loop region with
+ // VPBB as its entry, i.e., free of predecessors.
if (auto *R = VPBB->getParent())
return !R->isReplicator() && VPBB->getNumPredecessors() == 0;
- assert(!VPB->getParent() && "checking blocks in regions not implemented yet");
+ // A header dominates its second predecessor (the latch), with the other
+  // predecessor being the preheader.
return VPB->getPredecessors().size() == 2 &&
VPDT.dominates(VPB, VPB->getPredecessors()[1]);
}
+bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
+ const VPDominatorTree &VPDT) {
+ return VPB->getNumSuccessors() == 2 &&
+ VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT);
+}
+
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
while (It != end() && It->isPhi())
@@ -435,7 +445,7 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
auto Preds = to_vector(getHierarchicalPredecessors());
if (VPBlockUtils::isHeader(this, State.VPDT)) {
- // There's no block yet for the latch, don't try to connect it yet.
+ // There's no block for the latch yet, connect to the preheader only.
Preds = {Preds[0]};
}
@@ -443,8 +453,9 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
for (VPBlockBase *PredVPBlock : Preds) {
VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
+ assert(CFG.VPBB2IRBB.contains(PredVPBB) &&
+ "Predecessor basic-block not found building successor.");
BasicBlock *PredBB = CFG.VPBB2IRBB.lookup(PredVPBB);
- assert(PredBB && "Predecessor basic-block not found building successor.");
auto *PredBBTerminator = PredBB->getTerminator();
LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
@@ -548,8 +559,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
executeRecipes(State, NewBB);
// If this block is a latch, update CurrentParentLoop.
- if (getNumSuccessors() == 2 &&
- VPBlockUtils::isHeader(getSuccessors()[1], State->VPDT))
+ if (VPBlockUtils::isLatch(this, State->VPDT))
State->CurrentParentLoop = State->CurrentParentLoop->getParentLoop();
}
@@ -988,8 +998,7 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : RPOT)
Block->execute(State);
- VPBasicBlock *Header =
- vputils::getTopLevelVectorLoopHeader(*this, State->VPDT);
+ VPBasicBlock *Header = vputils::getFirstLoopHeader(*this, State->VPDT);
if (!Header)
return;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 497e3652c7a01..2a22da0365770 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -577,7 +577,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Cond = State.get(getOperand(0), VPLane(0));
// Replace the temporary unreachable terminator with a new conditional
// branch, hooking it up to backward destination (header) for latch blocks
- // now to forward destination(s) later when they are created.
+ // now, and to forward destination(s) later when they are created.
BranchInst *CondBr =
Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
CondBr->setSuccessor(0, nullptr);
@@ -601,9 +601,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
// Replace the temporary unreachable terminator with a new conditional
// branch, hooking it up to backward destination (the header) for latch
- // blocks now forward destination (the exit/middle block) later when it is
- // created. Note that CreateCondBr expects a valid BB as first argument, so
- // we need to set it to nullptr later.
+ // blocks now, and to forward destination (the exit/middle block) later when
+ // it is created. Note that CreateCondBr expects a valid BB as first
+ // argument, so we need to set it to nullptr later.
BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
State.CFG.VPBB2IRBB[Header]);
CondBr->setSuccessor(0, nullptr);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index fda0c70aaf5c6..ad49008945ef2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -126,8 +126,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
});
}
-VPBasicBlock *vputils::getTopLevelVectorLoopHeader(VPlan &Plan,
- VPDominatorTree &VPDT) {
+VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
auto DepthFirst = vp_depth_first_shallow(Plan.getEntry());
auto I = find_if(DepthFirst, [&VPDT](VPBlockBase *VPB) {
return VPBlockUtils::isHeader(VPB, VPDT);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index f2febb2282b4f..cb99b64ad6f25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -94,8 +94,9 @@ bool isHeaderMask(const VPValue *V, VPlan &Plan);
/// VPDerivedIV or VPCanonicalIVPHI).
bool isUniformAcrossVFsAndUFs(VPValue *V);
-/// Returns the header block of the top-level vector loop, if one exists.
-VPBasicBlock *getTopLevelVectorLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
+/// Returns the header block of the first, top-level loop, or null if none
+/// exist.
+VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
} // namespace vputils
//===----------------------------------------------------------------------===//
@@ -243,8 +244,12 @@ class VPBlockUtils {
VPBlockUtils::connectBlocks(BlockPtr, To, PredIx, -1);
}
- /// Returns true if \p VPB is a loop header.
- static bool isHeader(const VPBlockBase *VPBB, const VPDominatorTree &VPDT);
+ /// Returns true if \p VPB is a loop header, based on regions or \p VPDT in
+ /// their absence.
+ static bool isHeader(const VPBlockBase *VPB, const VPDominatorTree &VPDT);
+
+ /// Returns true if \p VPB is a loop latch, using isHeader().
+ static bool isLatch(const VPBlockBase *VPB, const VPDominatorTree &VPDT);
};
} // namespace llvm
>From 9cae5f728f2fd2e1e8a484ddd136af269db43f94 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 14 May 2025 13:51:48 +0100
Subject: [PATCH 4/5] !fixup address latest comments, thanks
---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 24 +++++----
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 50 ++++++++-----------
2 files changed, 35 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8ac819a61ee26..58aff840a03fb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -226,6 +226,9 @@ bool VPBlockUtils::isHeader(const VPBlockBase *VPB,
bool VPBlockUtils::isLatch(const VPBlockBase *VPB,
const VPDominatorTree &VPDT) {
+ // A latch has a header as its second successor, with its other successor
+ // leaving the loop. A preheader OTOH has a header as its first (and only)
+ // successor.
return VPB->getNumSuccessors() == 2 &&
VPBlockUtils::isHeader(VPB->getSuccessors()[1], VPDT);
}
@@ -455,7 +458,7 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
assert(CFG.VPBB2IRBB.contains(PredVPBB) &&
"Predecessor basic-block not found building successor.");
- BasicBlock *PredBB = CFG.VPBB2IRBB.lookup(PredVPBB);
+ BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
auto *PredBBTerminator = PredBB->getTerminator();
LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
@@ -1406,18 +1409,17 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
#endif
-bool VPValue::isDefinedOutsideLoop() const {
- auto *DefR = getDefiningRecipe();
- if (!DefR)
- return true;
-
-  // For non-live-ins, check if it is in a region only if the top-level loop
-  // region still exists.
- const VPBasicBlock *DefVPBB = DefR->getParent();
- auto *Plan = DefVPBB->getPlan();
- return Plan->getVectorLoopRegion() && !DefVPBB->getEnclosingLoopRegion();
+/// Returns true if \p VPV is defined by a recipe and either no vector loop
+/// region exists or its defining recipe is inside a loop region.
+static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
+ const VPRecipeBase *DefR = VPV->getDefiningRecipe();
+ return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
+ DefR->getParent()->getEnclosingLoopRegion());
}
+bool VPValue::isDefinedOutsideLoop() const {
+ return !isDefinedInsideLoopRegions(this);
+}
void VPValue::replaceAllUsesWith(VPValue *New) {
replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a22da0365770..106d0ec8c65b1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -456,6 +456,26 @@ Value *VPInstruction::generatePerLane(VPTransformState &State,
State.get(getOperand(1), Lane), Name);
}
+/// Create a conditional branch using \p Cond branching to the successors of \p
+/// VPBB. Note that the first successor is always forward (i.e. not created yet)
+/// while the second successor may already have been created (if it is a header
+/// block and VPBB is a latch).
+static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB,
+ VPTransformState &State) {
+ // Replace the temporary unreachable terminator with a new conditional
+ // branch, hooking it up to backward destination (header) for latch blocks
+ // now, and to forward destination(s) later when they are created.
+ // Second successor may be backwards - iff it is already in VPBB2IRBB.
+ VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
+ BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(SecondVPSucc);
+ BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
+ BranchInst *CondBr = State.Builder.CreateCondBr(Cond, IRBB, SecondIRSucc);
+  // The first successor is always forward; reset it to nullptr.
+ CondBr->setSuccessor(0, nullptr);
+ IRBB->getTerminator()->eraseFromParent();
+ return CondBr;
+}
+
Value *VPInstruction::generate(VPTransformState &State) {
IRBuilderBase &Builder = State.Builder;
@@ -575,40 +595,14 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
case VPInstruction::BranchOnCond: {
Value *Cond = State.get(getOperand(0), VPLane(0));
- // Replace the temporary unreachable terminator with a new conditional
- // branch, hooking it up to backward destination (header) for latch blocks
- // now, and to forward destination(s) later when they are created.
- BranchInst *CondBr =
- Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
- CondBr->setSuccessor(0, nullptr);
- Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
-
- VPBasicBlock *Header = cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
- if (!State.CFG.VPBB2IRBB.contains(Header))
- return CondBr;
-
- CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
- return CondBr;
+ return createCondBranch(Cond, getParent(), State);
}
case VPInstruction::BranchOnCount: {
// First create the compare.
Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
Value *TC = State.get(getOperand(1), /*IsScalar*/ true);
Value *Cond = Builder.CreateICmpEQ(IV, TC);
-
- // Now create the branch.
- VPBasicBlock *Header = cast<VPBasicBlock>(getParent()->getSuccessors()[1]);
-
- // Replace the temporary unreachable terminator with a new conditional
- // branch, hooking it up to backward destination (the header) for latch
- // blocks now, and to forward destination (the exit/middle block) later when
- // it is created. Note that CreateCondBr expects a valid BB as first
- // argument, so we need to set it to nullptr later.
- BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
- State.CFG.VPBB2IRBB[Header]);
- CondBr->setSuccessor(0, nullptr);
- Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
- return CondBr;
+ return createCondBranch(Cond, getParent(), State);
}
case VPInstruction::Broadcast: {
return Builder.CreateVectorSplat(
>From 6c169e04f30c564771feee083faa71ce52f8b28d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 17 May 2025 22:48:53 +0100
Subject: [PATCH 5/5] !fixup use ::isHeader in VPlan verifier.
---
llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 68b35d42e8674..54cf8ac2ed04a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -73,9 +73,7 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
auto RecipeI = VPBB->begin();
auto End = VPBB->end();
unsigned NumActiveLaneMaskPhiRecipes = 0;
- const VPRegionBlock *ParentR = VPBB->getParent();
- bool IsHeaderVPBB = ParentR && !ParentR->isReplicator() &&
- ParentR->getEntryBasicBlock() == VPBB;
+ bool IsHeaderVPBB = VPBlockUtils::isHeader(VPBB, VPDT);
while (RecipeI != End && RecipeI->isPhi()) {
if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI))
NumActiveLaneMaskPhiRecipes++;
More information about the llvm-commits
mailing list