[llvm] [VPlan] Dispatch to multiple exit blocks via middle blocks. (PR #112138)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 12:41:13 PST 2024
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/112138
>From 245b56a80bca6369a9be3102308617f2a4a4d51b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 1 Oct 2024 20:57:24 +0100
Subject: [PATCH 1/9] [VPlan] Support VPIRBBs and VPIRInst phis with multiple
predecessors.
---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 5 ++++-
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++++----
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c1b97791331bcf..8609514c39e7d0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1056,7 +1056,10 @@ void VPlan::execute(VPTransformState *State) {
State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
// Generate code in the loop pre-header and body.
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ Entry);
+
+ for (VPBlockBase *Block : RPOT)
Block->execute(State);
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 41f13cc2d9a978..be3e958320e771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -857,12 +857,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
void VPIRInstruction::execute(VPTransformState &State) {
assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
"Only PHINodes can have extra operands");
- if (getNumOperands() == 1) {
- VPValue *ExitValue = getOperand(0);
+ for (const auto &[Idx, Op] : enumerate(operands())) {
+ VPValue *ExitValue = Op;
auto Lane = vputils::isUniformAfterVectorization(ExitValue)
? VPLane::getFirstLane()
: VPLane::getLastLaneForVF(State.VF);
- auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+ VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
+ auto *PredVPBB = Pred->getExitingBasicBlock();
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
@@ -890,7 +891,7 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
O << Indent << "IR " << I;
if (getNumOperands() != 0) {
- assert(getNumOperands() == 1 && "can have at most 1 operand");
+ // assert(getNumOperands() == 1 && "can have at most 1 operand");
O << " (extra operand: ";
printOperands(O, SlotTracker);
O << ")";
>From 47258deea863675e43fd7fd48376dce131441dc5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 18 Sep 2024 21:35:57 +0100
Subject: [PATCH 2/9] [VPlan] Dispatch to multiple exit blocks via middle
blocks.
A more lightweight variant of https://github.com/llvm/llvm-project/pull/109193,
which dispatches to multiple exit blocks via the middle blocks.
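For illustration (not part of the patch): a hand-written, simplified LLVM IR sketch of the control flow this lowers to for a loop with one early exit (e1) and the latch exit (e2), loosely following the multi-exit-codegen.ll checks added below. Block and value names are illustrative, the widened body and masked store are elided, and the trip count of 128 is assumed from the test.

define i64 @sketch(ptr %p, i64 %N) {
vector.ph:
  %bc = insertelement <4 x i64> poison, i64 %N, i64 0
  %splat = shufflevector <4 x i64> %bc, <4 x i64> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  ; lanes that would take the early exit in the scalar loop
  %early.mask = icmp uge <4 x i64> %vec.ind, %splat
  ; ... widened body (masked store etc.) elided ...
  %index.next = add nuw i64 %index, 4
  %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
  ; any-of reduction of the early-exit mask
  %early.taken = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %early.mask)
  %latch.taken = icmp eq i64 %index.next, 128
  %any.taken = or i1 %early.taken, %latch.taken
  br i1 %any.taken, label %middle.split, label %vector.body

middle.split:                                   ; dispatch to the early exit
  br i1 %early.taken, label %e1, label %middle.block

middle.block:                                   ; usual latch-exit handling
  br i1 true, label %e2, label %scalar.ph

scalar.ph:                                      ; stands in for the scalar remainder loop
  br label %e2

e1:
  ret i64 0

e2:
  ret i64 1
}

declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)

The key point is that the vector latch branches on the OR of all exit conditions, and the new middle.split block then re-checks only the early-exit condition to pick between the early-exit block and the regular middle block.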
---
.../Vectorize/LoopVectorizationLegality.h | 3 +
.../Vectorize/LoopVectorizationLegality.cpp | 29 +++
.../Transforms/Vectorize/LoopVectorize.cpp | 82 +++---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 39 ++-
llvm/lib/Transforms/Vectorize/VPlan.h | 1 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 82 ++++++
.../Transforms/Vectorize/VPlanTransforms.h | 4 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 8 -
.../LoopVectorize/X86/multi-exit-codegen.ll | 240 ++++++++++++++++++
.../LoopVectorize/X86/multi-exit-cost.ll | 18 +-
.../LoopVectorize/X86/multi-exit-vplan.ll | 148 +++++++++++
12 files changed, 614 insertions(+), 56 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index dc7e484a40a452..af6fae44cf0f09 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,6 +287,9 @@ class LoopVectorizationLegality {
/// we can use in-order reductions.
bool canVectorizeFPMath(bool EnableStrictReductions);
+ /// Returns true if the loop has an early exit that we can vectorize.
+ bool canVectorizeEarlyExit() const;
+
/// Return true if we can vectorize this loop while folding its tail by
/// masking.
bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..ee53d28a4c8282 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
cl::desc("Enable recognition of non-constant strided "
"pointer induction variables."));
+static cl::opt<bool>
+ EnableEarlyExitVectorization("enable-early-exit-vectorization",
+ cl::init(false), cl::Hidden, cl::desc(""));
+
namespace llvm {
cl::opt<bool>
HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1378,6 +1382,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+ // When vectorizing early exits, create predicates for all blocks, except the
+ // header.
+ if (canVectorizeEarlyExit() && BB != TheLoop->getHeader())
+ return true;
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
@@ -1514,6 +1522,27 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
return true;
}
+bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
+ // Currently only allow vectorizing loops with early exits, if early-exit
+ // vectorization is explicitly enabled and the loop has metadata to force
+ // vectorization.
+ if (!EnableEarlyExitVectorization)
+ return false;
+
+ SmallVector<BasicBlock *> Exiting;
+ TheLoop->getExitingBlocks(Exiting);
+ if (Exiting.size() == 1)
+ return false;
+
+ LoopVectorizeHints Hints(TheLoop, true, *ORE);
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
+
+ Function *Fn = TheLoop->getHeader()->getParent();
+ return Hints.allowVectorization(Fn, TheLoop,
+ true /*VectorizeOnlyWhenForced*/);
+}
+
// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
bool UseVPlanNativePath) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8653498d32a12..befe8f7c0076a3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1363,9 +1363,11 @@ class LoopVectorizationCostModel {
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(
- dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
- return true;
+ if (!Legal->canVectorizeEarlyExit()) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+ return true;
+ }
}
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2575,7 +2577,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
- assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+ assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) ||
+ Legal->canVectorizeEarlyExit()) &&
"multiple exit loop without required epilogue?");
LoopMiddleBlock =
@@ -2758,8 +2761,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.
- assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
-
DenseMap<Value *, Value *> MissingVals;
// An external user of the last iteration's value should see the value that
@@ -2819,6 +2820,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
PHI->addIncoming(I.second, MiddleBlock);
}
+
+ assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
+ "Expected a single exit block");
}
namespace {
@@ -3599,7 +3603,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
TheLoop->getExitingBlocks(Exiting);
for (BasicBlock *E : Exiting) {
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
+ (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit()))
AddToWorklistIfAllowed(Cmp);
}
@@ -7692,12 +7697,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);
// 2.5 Collect reduction resume values.
- auto *ExitVPBB =
- cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
- for (VPRecipeBase &R : *ExitVPBB) {
- createAndCollectMergePhiForReduction(
- dyn_cast<VPInstruction>(&R), State, OrigLoop,
- State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+ VPBasicBlock *ExitVPBB = nullptr;
+ if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
+ ExitVPBB = cast<VPBasicBlock>(
+ BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ for (VPRecipeBase &R : *ExitVPBB) {
+ createAndCollectMergePhiForReduction(
+ dyn_cast<VPInstruction>(&R), State, OrigLoop,
+ State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+ }
}
// 2.6. Maintain Loop Hints
@@ -7723,6 +7731,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
}
+
TargetTransformInfo::UnrollingPreferences UP;
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7735,15 +7744,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
// 4. Adjust branch weight of the branch in the middle block.
- auto *MiddleTerm =
- cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
- if (MiddleTerm->isConditional() &&
- hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- // Assume that `Count % VectorTripCount` is equally distributed.
- unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
- assert(TripCount > 0 && "trip count should not be zero");
- const uint32_t Weights[] = {1, TripCount - 1};
- setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ if (ExitVPBB) {
+ auto *MiddleTerm =
+ cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+ if (MiddleTerm->isConditional() &&
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ // Assume that `Count % VectorTripCount` is equally distributed.
+ unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+ assert(TripCount > 0 && "trip count should not be zero");
+ const uint32_t Weights[] = {1, TripCount - 1};
+ setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ }
}
return State.ExpandedSCEVs;
@@ -8128,7 +8139,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
// adding uses of an otherwise potentially dead instruction.
- if (OrigLoop->isLoopExiting(Src))
+ if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src))
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8778,6 +8789,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+ if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
+ return {};
auto *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
// No edge from the middle block to the unique exit block has been inserted
@@ -8863,6 +8876,8 @@ static void addLiveOutsForFirstOrderRecurrences(
// TODO: Should be replaced by
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
// scalar region is modeled as well.
+ if (!VectorRegion->getSingleSuccessor())
+ return;
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9146,10 +9161,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
- addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
- addUsersInExitBlock(*Plan, ExitUsersToFix);
+ if (Legal->canVectorizeEarlyExit()) {
+ VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
+ RecipeBuilder);
+ } else {
+ SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
+ OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+ addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+ addUsersInExitBlock(*Plan, ExitUsersToFix);
+ }
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9277,8 +9297,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
using namespace VPlanPatternMatch;
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
- VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
@@ -9297,8 +9315,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPUser *U : Cur->users()) {
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
- assert(UserRecipe->getParent() == MiddleVPBB &&
- "U must be either in the loop region or the middle block.");
continue;
}
Worklist.insert(UserRecipe);
@@ -9403,6 +9419,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*LatchVPBB->begin());
+ if (!VectorLoopRegion->getSingleSuccessor())
+ return;
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8609514c39e7d0..eb7c808551340d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
// backedges. A backward successor is set when the branch is created.
const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ if (TermBr->getSuccessor(idx) &&
+ PredVPBlock == getPlan()->getVectorLoopRegion() &&
+ PredVPBlock->getNumSuccessors()) {
+ // Update PredBB and TermBr for BranchOnMultiCond in predecessor.
+ PredBB = TermBr->getSuccessor(1);
+ TermBr = cast<BranchInst>(PredBB->getTerminator());
+ idx = 0;
+ }
assert(!TermBr->getSuccessor(idx) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, IRBB);
@@ -908,8 +916,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
if (!RequiresScalarEpilogueCheck) {
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
}
@@ -923,10 +931,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
- auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
- // The connection order corresponds to the operands of the conditional branch.
- VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
- VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ if (IRExitBlock) {
+ auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+ // The connection order corresponds to the operands of the conditional
+ // branch.
+ VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ }
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
// Here we use the same DebugLoc as the scalar loop latch terminator instead
@@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) {
// VPlan execution rather than earlier during VPlan construction.
BasicBlock *MiddleBB = State->CFG.ExitBB;
VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ getVectorLoopRegion()->getNumSuccessors() == 1
+ ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
+ : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
// Find the VPBB for the scalar preheader, relying on the current structure
// when creating the middle block and its successors: if there's a single
// predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1044,6 +1058,10 @@ void VPlan::execute(VPTransformState *State) {
MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]);
assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
"scalar preheader cannot be wrapped already");
+ if (ScalarPhVPBB->getNumSuccessors() != 0) {
+ ScalarPhVPBB = cast<VPBasicBlock>(ScalarPhVPBB->getSuccessors()[1]);
+ MiddleVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+ }
replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
@@ -1065,6 +1083,10 @@ void VPlan::execute(VPTransformState *State) {
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+ if (!getVectorLoopRegion()->getSingleSuccessor())
+ VectorLatchBB =
+ cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
+
// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
@@ -1091,7 +1113,10 @@ void VPlan::execute(VPTransformState *State) {
// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
- Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+ if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI())
+ Inc->moveBefore(VectorLatchBB->getTerminator());
+ else
+ Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
// Use the steps for the last part as backedge value for the induction.
if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 59a084401cc9bf..21f44eac188936 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1274,6 +1274,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
+ AnyOf,
};
private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index be3e958320e771..9d5c609ad26043 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
default:
return true;
}
+ case VPExpandSCEVSC:
+ return getParent()->getPlan()->getTripCount() == getVPSingleValue();
case VPInterleaveSC:
return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
@@ -160,6 +162,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPPredInstPHISC:
case VPScalarCastSC:
return false;
+ case VPExpandSCEVSC:
+ return getParent()->getPlan()->getTripCount() == getVPSingleValue();
case VPInstructionSC:
return mayWriteToMemory();
case VPWidenCallSC: {
@@ -399,6 +403,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::AnyOf:
return true;
default:
return false;
@@ -674,6 +679,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
return NewPhi;
}
+ case VPInstruction::AnyOf: {
+ Value *A = State.get(getOperand(0));
+ return Builder.CreateOrReduce(A);
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
@@ -682,7 +691,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ComputeReductionResult;
+ getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::AnyOf;
}
bool VPInstruction::isSingleScalar() const {
@@ -745,6 +755,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
return false;
case Instruction::ICmp:
case Instruction::Select:
+ case Instruction::Or:
case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
@@ -840,6 +851,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
+ case VPInstruction::AnyOf:
+ O << "any-of";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d50f3c0c3f3e04..a86498eb9aa30c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -515,6 +515,12 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan.getEntry());
+ for (VPRecipeBase &R : make_early_inc_range(
+ reverse(*cast<VPBasicBlock>(Plan.getPreheader())))) {
+ if (isDeadRecipe(R))
+ R.eraseFromParent();
+ }
+
for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
@@ -1696,3 +1702,79 @@ void VPlanTransforms::createInterleaveGroups(
}
}
}
+
+void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ VPRecipeBuilder &RecipeBuilder) {
+ auto *LatchVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+ VPBuilder Builder(LatchVPBB->getTerminator());
+ auto *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+
+ const SCEV *BackedgeTakenCount =
+ SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+ const SCEV *TripCount = SE.getTripCountFromExitCount(
+ BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop);
+ VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE);
+ Plan.getTripCount()->replaceAllUsesWith(NewTC);
+ Plan.resetTripCount(NewTC);
+
+ VPValue *EarlyExitTaken = nullptr;
+ SmallVector<BasicBlock *> ExitingBBs;
+ OrigLoop->getExitingBlocks(ExitingBBs);
+ for (BasicBlock *Exiting : ExitingBBs) {
+ auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
+ BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
+ BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
+ VPIRBasicBlock *VPExitBlock;
+ if (OrigLoop->getUniqueExitBlock())
+ VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+ else
+ VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+ !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+
+ for (VPRecipeBase &R : *VPExitBlock) {
+ auto *ExitIRI = cast<VPIRInstruction>(&R);
+ auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
+ if (!ExitPhi)
+ break;
+ Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting);
+ VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
+ ExitIRI->addOperand(V);
+ }
+
+ if (Exiting == OrigLoop->getLoopLatch()) {
+ if (MiddleVPBB->getNumSuccessors() == 0) {
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ }
+ continue;
+ }
+
+ VPValue *M = RecipeBuilder.getBlockInMask(
+ OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+ auto *N = Builder.createNot(M);
+ EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N});
+
+ VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+ VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB);
+ VPBlockUtils::insertBlockAfter(NewMiddle, LoopRegion);
+ VPBlockUtils::connectBlocks(NewMiddle, VPExitBlock);
+ VPBlockUtils::connectBlocks(NewMiddle, MiddleVPBB);
+
+ VPBuilder MiddleBuilder(NewMiddle);
+ MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
+ // MiddleVPBB = NewMiddle;
+ }
+ auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
+ auto *IsLatchExiting = Builder.createICmp(
+ CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1));
+ auto *AnyExiting =
+ Builder.createNaryOp(Instruction::Or, {EarlyExitTaken, IsLatchExiting});
+ Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExiting);
+ Term->eraseFromParent();
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 60a44bfb0dca6b..9745211db275f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -123,6 +123,10 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
+
+ static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ VPRecipeBuilder &RecipeBuilder);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7ea5ee341cc547..1ac79f8887ab46 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -248,14 +248,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
return false;
}
- VPBlockBase *MiddleBB =
- IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor();
- if (IRBB != IRBB->getPlan()->getPreheader() &&
- IRBB->getSinglePredecessor() != MiddleBB) {
- errs() << "VPIRBasicBlock can only be used as pre-header or a successor of "
- "middle-block at the moment!\n";
- return false;
- }
return true;
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
new file mode 100644
index 00000000000000..0c33715c6bd271
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization %s | FileCheck --check-prefix=MULTI %s
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s
+
+define i64 @multi_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; MULTI-NEXT: [[ENTRY:.*]]:
+; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; MULTI: [[VECTOR_PH]]:
+; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; MULTI-NEXT: br label %[[VECTOR_BODY:.*]]
+; MULTI: [[VECTOR_BODY]]:
+; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
+; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
+; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; MULTI: [[MIDDLE_SPLIT]]:
+; MULTI-NEXT: br i1 [[TMP6]], label %[[E1:.*]], label %[[MIDDLE_BLOCK:.*]]
+; MULTI: [[MIDDLE_BLOCK]]:
+; MULTI-NEXT: br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]]
+; MULTI: [[SCALAR_PH]]:
+; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MULTI-NEXT: br label %[[LOOP_HEADER:.*]]
+; MULTI: [[LOOP_HEADER]]:
+; MULTI-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; MULTI-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; MULTI-NEXT: br i1 [[CMP1]], label %[[E1]], label %[[LOOP_LATCH]]
+; MULTI: [[LOOP_LATCH]]:
+; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; MULTI-NEXT: [[INC]] = add nuw i64 [[I_07]], 1
+; MULTI-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; MULTI-NEXT: br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; MULTI: [[E1]]:
+; MULTI-NEXT: ret i64 0
+; MULTI: [[E2]]:
+; MULTI-NEXT: ret i64 1
+;
+; DEFAULT-LABEL: define i64 @multi_exit_with_store(
+; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
+; DEFAULT-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]]
+; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
+; DEFAULT: [[VECTOR_BODY]]:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT: [[MIDDLE_BLOCK]]:
+; DEFAULT-NEXT: br label %[[SCALAR_PH]]
+; DEFAULT: [[SCALAR_PH]]:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
+; DEFAULT: [[LOOP_HEADER]]:
+; DEFAULT-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; DEFAULT-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; DEFAULT-NEXT: br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]]
+; DEFAULT: [[LOOP_LATCH]]:
+; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT: [[INC]] = add nuw i64 [[I_07]], 1
+; DEFAULT-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; DEFAULT-NEXT: br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: [[E1]]:
+; DEFAULT-NEXT: ret i64 0
+; DEFAULT: [[E2]]:
+; DEFAULT-NEXT: ret i64 1
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
+
+e1:
+ ret i64 0
+
+e2:
+ ret i64 1
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; MULTI-NEXT: [[ENTRY:.*]]:
+; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; MULTI: [[VECTOR_PH]]:
+; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; MULTI-NEXT: br label %[[VECTOR_BODY:.*]]
+; MULTI: [[VECTOR_BODY]]:
+; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
+; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
+; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; MULTI: [[MIDDLE_SPLIT]]:
+; MULTI-NEXT: br i1 [[TMP6]], label %[[E:.*]], label %[[MIDDLE_BLOCK:.*]]
+; MULTI: [[MIDDLE_BLOCK]]:
+; MULTI-NEXT: br i1 true, label %[[E]], label %[[SCALAR_PH]]
+; MULTI: [[SCALAR_PH]]:
+; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MULTI-NEXT: br label %[[LOOP_HEADER:.*]]
+; MULTI: [[LOOP_HEADER]]:
+; MULTI-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; MULTI-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
+; MULTI-NEXT: br i1 [[C_1]], label %[[E]], label %[[LOOP_LATCH]]
+; MULTI: [[LOOP_LATCH]]:
+; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
+; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; MULTI-NEXT: [[INC]] = add nuw i64 [[IV]], 1
+; MULTI-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; MULTI-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; MULTI: [[E]]:
+; MULTI-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[MIDDLE_SPLIT]] ]
+; MULTI-NEXT: ret i64 [[P1]]
+;
+; DEFAULT-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
+; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
+; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
+; DEFAULT: [[VECTOR_BODY]]:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP3]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT: [[MIDDLE_BLOCK]]:
+; DEFAULT-NEXT: br label %[[SCALAR_PH]]
+; DEFAULT: [[SCALAR_PH]]:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
+; DEFAULT: [[LOOP_HEADER]]:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; DEFAULT-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
+; DEFAULT-NEXT: br i1 [[C_1]], label %[[E:.*]], label %[[LOOP_LATCH]]
+; DEFAULT: [[LOOP_LATCH]]:
+; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
+; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT: [[INC]] = add nuw i64 [[IV]], 1
+; DEFAULT-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; DEFAULT-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT: [[E]]:
+; DEFAULT-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ]
+; DEFAULT-NEXT: ret i64 [[P1]]
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
+
+e:
+ %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+ ret i64 %p1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; MULTI: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; MULTI: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MULTI: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; MULTI: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; MULTI: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index cd128979fc1431..1c02f10753745c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
-; CHECK-NEXT: [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]])
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[N]], -1
; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]])
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP32:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT: [[TMP33:%.*]] = freeze i64 [[TMP32]]
+; CHECK-NEXT: [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]])
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1
; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
new file mode 100644
index 00000000000000..5c5d532b93bc89
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
+
+define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
+
+e1:
+ %p1 = phi i64 [ 0, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
+
+e:
+ %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+ ret i64 %p1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
>From 3831acb97053230cb09f8316ce1ada17be50564c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 31 Oct 2024 19:48:40 +0000
Subject: [PATCH 3/9] !fixup address first set of comments, thanks!
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 13 ++-----------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 16 +---------------
2 files changed, 3 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3a69750460f4d8..80a0fda81aeaf1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7711,16 +7711,13 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);
// 2.5 Collect reduction resume values.
- VPBasicBlock *ExitVPBB = nullptr;
- if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
- ExitVPBB = cast<VPBasicBlock>(
- BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ VPBasicBlock *ExitVPBB =
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
if (VectorizingEpilogue)
for (VPRecipeBase &R : *ExitVPBB) {
fixReductionScalarResumeWhenVectorizingEpilog(
&R, State, State.CFG.VPBB2IRBB[ExitVPBB]);
}
- }
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
@@ -8809,8 +8806,6 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
- if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
- return {};
auto *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
// No edge from the middle block to the unique exit block has been inserted
@@ -8896,8 +8891,6 @@ static void addLiveOutsForFirstOrderRecurrences(
// TODO: Should be replaced by
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
// scalar region is modeled as well.
- if (!VectorRegion->getSingleSuccessor())
- return;
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9447,8 +9440,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*LatchVPBB->begin());
- if (!VectorLoopRegion->getSingleSuccessor())
- return;
VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index be3579b4cd5dde..4032468b4f76da 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,14 +474,6 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
// backedges. A backward successor is set when the branch is created.
const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
- if (TermBr->getSuccessor(idx) &&
- PredVPBlock == getPlan()->getVectorLoopRegion() &&
- PredVPBlock->getNumSuccessors()) {
- // Update PredBB and TermBr for BranchOnMultiCond in predecessor.
- PredBB = TermBr->getSuccessor(1);
- TermBr = cast<BranchInst>(PredBB->getTerminator());
- idx = 0;
- }
assert(!TermBr->getSuccessor(idx) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, IRBB);
@@ -1043,9 +1035,7 @@ void VPlan::execute(VPTransformState *State) {
// VPlan execution rather than earlier during VPlan construction.
BasicBlock *MiddleBB = State->CFG.ExitBB;
VPBasicBlock *MiddleVPBB =
- getVectorLoopRegion()->getNumSuccessors() == 1
- ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
- : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
+ cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
// Find the VPBB for the scalar preheader, relying on the current structure
// when creating the middle block and its successors: if there's a single
// predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1083,10 +1073,6 @@ void VPlan::execute(VPTransformState *State) {
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
- if (!getVectorLoopRegion()->getSingleSuccessor())
- VectorLatchBB =
- cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
-
// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
>From 64db0eea4073e1cdc3d394155754ed0653ca0c3d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 4 Nov 2024 21:10:24 +0000
Subject: [PATCH 4/9] !fixup clean up merge failures
---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 18 ++++++++++--------
llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 11 ++---------
.../LoopVectorize/X86/multi-exit-vplan.ll | 11 ++++++++++-
4 files changed, 24 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 924b63bbd9639f..e2c063928e9906 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -911,7 +911,6 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
if (!RequiresScalarEpilogueCheck) {
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
}
@@ -925,14 +924,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
- if (IRExitBlock) {
- auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
- // The connection order corresponds to the operands of the conditional
- // branch.
- VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
- VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ if (!IRExitBlock) {
+ auto *Term = cast<BranchInst>(TheLoop->getLoopLatch()->getTerminator());
+ IRExitBlock = TheLoop->contains(Term->getSuccessor(0))
+ ? Term->getSuccessor(1)
+ : Term->getSuccessor(0);
}
+ auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+ // The connection order corresponds to the operands of the conditional
+ // branch.
+ VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
// Here we use the same DebugLoc as the scalar loop latch terminator instead
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 861fea7ff469ff..8efa648a7e1ea7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3822,10 +3822,10 @@ class VPlan {
/// whether to execute the scalar tail loop or the exit block from the loop
/// latch.
const VPBasicBlock *getMiddleBlock() const {
- return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
}
VPBasicBlock *getMiddleBlock() {
- return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
}
/// The trip count of the original loop.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index eed3efd666e712..1369693b01971c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1797,8 +1797,7 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
auto *LatchVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
VPBuilder Builder(LatchVPBB->getTerminator());
- auto *MiddleVPBB =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ auto *MiddleVPBB = Plan.getMiddleBlock();
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
@@ -1818,7 +1817,7 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
VPIRBasicBlock *VPExitBlock;
- if (OrigLoop->getUniqueExitBlock())
+ if (OrigLoop->getUniqueExitBlock() || Exiting == OrigLoop->getLoopLatch())
VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
else
VPExitBlock = VPIRBasicBlock::fromBasicBlock(
@@ -1835,11 +1834,6 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
}
if (Exiting == OrigLoop->getLoopLatch()) {
- if (MiddleVPBB->getNumSuccessors() == 0) {
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
- VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
- VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
- }
continue;
}
@@ -1856,7 +1850,6 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
VPBuilder MiddleBuilder(NewMiddle);
MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
- // MiddleVPBB = NewMiddle;
}
auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
auto *IsLatchExiting = Builder.createICmp(
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
index 5c5d532b93bc89..47304c571bfcb1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -49,9 +49,13 @@ define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-;
entry:
br label %loop.header
@@ -120,6 +124,11 @@ define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
>From 0f8aedfaf89bc6dbe18bd00e6bad0aad52db10f3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 5 Nov 2024 13:50:43 +0000
Subject: [PATCH 5/9] !fixup address latest comments, thanks!
---
.../Vectorize/LoopVectorizationLegality.cpp | 8 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 121 ++++++++++--------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 13 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 48 +++----
.../Transforms/Vectorize/VPlanTransforms.h | 6 +-
.../LoopVectorize/X86/multi-exit-vplan.ll | 82 +-----------
6 files changed, 98 insertions(+), 180 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0d8bda5a2112c7..ed3808d2f30bf1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,10 +43,6 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
cl::desc("Enable recognition of non-constant strided "
"pointer induction variables."));
-static cl::opt<bool>
- EnableEarlyExitVectorization("enable-early-exit-vectorization",
- cl::init(false), cl::Hidden, cl::desc(""));
-
namespace llvm {
cl::opt<bool>
HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1381,7 +1377,7 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
// When vectorizing early exits, create predicates for all blocks, except the
// header.
- if (canVectorizeEarlyExit() && BB != TheLoop->getHeader())
+ if (hasUncountableEarlyExit() && BB != TheLoop->getHeader())
return true;
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
@@ -1523,8 +1519,6 @@ bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
// Currently only allow vectorizing loops with early exits, if early-exit
// vectorization is explicitly enabled and the loop has metadata to force
// vectorization.
- if (!EnableEarlyExitVectorization)
- return false;
SmallVector<BasicBlock *> Exiting;
TheLoop->getExitingBlocks(Exiting);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 60607b7cf6b46c..47e78a916f8cea 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -384,6 +384,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
cl::Hidden,
cl::desc("Try wider VFs if they enable the use of vector variants"));
+static cl::opt<bool> EnableEarlyExitVectorization(
+ "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Enable vectorization of early exit loops with uncountable exits."));
+
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1358,14 +1363,13 @@ class LoopVectorizationCostModel {
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
}
- // If we might exit from anywhere but the latch, must run the exiting
- // iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- if (!Legal->canVectorizeEarlyExit()) {
- LLVM_DEBUG(
- dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
- return true;
- }
+ // If we might exit from anywhere but the latch and early exit vectorization
+ // is disabled, we must run the exiting iteration in scalar form.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
+ !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+ return true;
}
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2576,7 +2580,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) ||
- Legal->canVectorizeEarlyExit()) &&
+ Legal->hasUncountableEarlyExit()) &&
"multiple exit loop without required epilogue?");
LoopMiddleBlock =
@@ -2809,6 +2813,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
}
}
+ assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
+ "Expected a single exit block for escaping values");
for (auto &I : MissingVals) {
PHINode *PHI = cast<PHINode>(I.first);
// One corner case we have to handle is two IVs "chasing" each other,
@@ -2819,9 +2825,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
PHI->addIncoming(I.second, MiddleBlock);
}
-
- assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
- "Expected a single exit block");
}
namespace {
@@ -3597,7 +3600,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
for (BasicBlock *E : Exiting) {
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
- (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit()))
+ (TheLoop->getLoopLatch() == E || !Legal->hasUncountableEarlyExit()))
AddToWorklistIfAllowed(Cmp);
}
@@ -8144,7 +8147,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
// adding uses of an otherwise potentially dead instruction.
- if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src))
+ if (!Legal->hasUncountableEarlyExit() && OrigLoop->isLoopExiting(Src))
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8835,39 +8838,43 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
- auto *MiddleVPBB = Plan.getMiddleBlock();
- // No edge from the middle block to the unique exit block has been inserted
- // and there is nothing to fix from vector loop; phis should have incoming
- // from scalar loop only.
- if (MiddleVPBB->getNumSuccessors() != 2)
- return {};
SetVector<VPIRInstruction *> ExitUsersToFix;
- VPBasicBlock *ExitVPBB = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
- BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
- for (VPRecipeBase &R : *ExitVPBB) {
- auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
- if (!ExitIRI)
- continue;
- auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
- if (!ExitPhi)
- break;
- Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
- VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
- // Exit values for inductions are computed and updated outside of VPlan and
- // independent of induction recipes.
- // TODO: Compute induction exit values in VPlan.
- if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
- !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
- isa<VPWidenPointerInductionRecipe>(V) ||
- (isa<Instruction>(IncomingValue) &&
- OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
- any_of(IncomingValue->users(), [&Inductions](User *U) {
- auto *P = dyn_cast<PHINode>(U);
- return P && Inductions.contains(P);
- })))
+ for (VPBlockBase *VPB : vp_depth_first_shallow(
+ Plan.getVectorLoopRegion()->getSingleSuccessor())) {
+ if (VPB->getNumSuccessors() != 0 || VPB == Plan.getScalarHeader())
continue;
- ExitUsersToFix.insert(ExitIRI);
- ExitIRI->addOperand(V);
+ auto *ExitVPBB = cast<VPIRBasicBlock>(VPB);
+ BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
+ BasicBlock *ExitingBB = find_singleton<BasicBlock>(
+ to_vector(predecessors(ExitBB)),
+ [OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
+ return OrigLoop->contains(Pred) ? Pred : nullptr;
+ });
+ for (VPRecipeBase &R : *ExitVPBB) {
+ auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
+ if (!ExitIRI)
+ continue;
+ auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
+ if (!ExitPhi)
+ break;
+ Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+ VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
+ // Exit values for inductions are computed and updated outside of VPlan
+ // and independent of induction recipes.
+ // TODO: Compute induction exit values in VPlan.
+ if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+ !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+ isa<VPWidenPointerInductionRecipe>(V) ||
+ (isa<Instruction>(IncomingValue) &&
+ OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
+ any_of(IncomingValue->users(), [&Inductions](User *U) {
+ auto *P = dyn_cast<PHINode>(U);
+ return P && Inductions.contains(P);
+ })))
+ continue;
+ ExitUsersToFix.insert(ExitIRI);
+ ExitIRI->addOperand(V);
+ }
}
return ExitUsersToFix;
}
@@ -9168,16 +9175,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
- if (Legal->canVectorizeEarlyExit()) {
- VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
- RecipeBuilder);
- } else {
+ if (Legal->hasUncountableEarlyExit()) {
+ VPlanTransforms::handleUncountableEarlyExit(*Plan, *PSE.getSE(), OrigLoop,
+ RecipeBuilder);
+ }
addScalarResumePhis(RecipeBuilder, *Plan);
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addUsersInExitBlock(*Plan, ExitUsersToFix);
- }
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9940,12 +9946,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
if (LVL.hasUncountableEarlyExit()) {
- reportVectorizationFailure("Auto-vectorization of loops with uncountable "
- "early exit is not yet supported",
- "Auto-vectorization of loops with uncountable "
- "early exit is not yet supported",
- "UncountableEarlyExitLoopsUnsupported", ORE, L);
- return false;
+ if (!EnableEarlyExitVectorization) {
+ reportVectorizationFailure("Auto-vectorization of loops with uncountable "
+ "early exit is not yet supported",
+ "Auto-vectorization of loops with uncountable "
+ "early exit is not yet supported",
+ "UncountableEarlyExitLoopsUnsupported", ORE,
+ L);
+ return false;
+ }
}
// Entrance to the VPlan-native vectorization path. Outer loops are processed
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index ed2d6aa5f3df4b..1c609d89c47b9b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -878,15 +878,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader);
// Create SCEV and VPValue for the trip count.
-
- // Currently only loops with countable exits are vectorized, but calling
- // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
- // uncountable exits whilst also ensuring the symbolic maximum and known
- // back-edge taken count remain identical for loops with countable exits.
+  // We use the symbolic max backedge-taken-count, which is needed when
+  // vectorizing loops with uncountable early exits.
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
- assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
- BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
- "Invalid loop count");
ScalarEvolution &SE = *PSE.getSE();
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
InductionTy, TheLoop);
@@ -922,6 +916,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
if (!IRExitBlock) {
+    // If there's no unique exit block (i.e. when vectorizing a loop with an
+    // uncountable early exit), use the exit block reached from the latch. The
+    // exit blocks of the uncountable exits will be added later.
auto *Term = cast<BranchInst>(TheLoop->getLoopLatch()->getTerminator());
IRExitBlock = TheLoop->contains(Term->getSuccessor(0))
? Term->getSuccessor(1)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1369693b01971c..d336827b23ddf3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1791,51 +1791,33 @@ void VPlanTransforms::createInterleaveGroups(
}
}
-void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
- Loop *OrigLoop,
- VPRecipeBuilder &RecipeBuilder) {
+void VPlanTransforms::handleUncountableEarlyExit(
+ VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
+ VPRecipeBuilder &RecipeBuilder) {
auto *LatchVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
VPBuilder Builder(LatchVPBB->getTerminator());
auto *MiddleVPBB = Plan.getMiddleBlock();
-
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
- const SCEV *BackedgeTakenCount =
- SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch());
- const SCEV *TripCount = SE.getTripCountFromExitCount(
- BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop);
- VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE);
- Plan.getTripCount()->replaceAllUsesWith(NewTC);
- Plan.resetTripCount(NewTC);
-
VPValue *EarlyExitTaken = nullptr;
SmallVector<BasicBlock *> ExitingBBs;
OrigLoop->getExitingBlocks(ExitingBBs);
+
+  // Process all uncountable exiting blocks. For each exiting block, update
+  // EarlyExitTaken, which tracks whether any uncountable early exit has been
+  // taken. Also split the middle block and branch to the exit block of the
+  // early exit if it has been taken.
for (BasicBlock *Exiting : ExitingBBs) {
+ if (Exiting == OrigLoop->getLoopLatch())
+ continue;
+
auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
VPIRBasicBlock *VPExitBlock;
- if (OrigLoop->getUniqueExitBlock() || Exiting == OrigLoop->getLoopLatch())
- VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
- else
- VPExitBlock = VPIRBasicBlock::fromBasicBlock(
- !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
-
- for (VPRecipeBase &R : *VPExitBlock) {
- auto *ExitIRI = cast<VPIRInstruction>(&R);
- auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
- if (!ExitPhi)
- break;
- Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting);
- VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
- ExitIRI->addOperand(V);
- }
-
- if (Exiting == OrigLoop->getLoopLatch()) {
- continue;
- }
+ VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+ !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
VPValue *M = RecipeBuilder.getBlockInMask(
OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
@@ -1851,6 +1833,10 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
VPBuilder MiddleBuilder(NewMiddle);
MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
}
+
+  // Replace the condition controlling the exit from the vector loop with one
+  // that exits if either the original condition of the vector latch is true or
+  // any early exit has been taken.
auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
auto *IsLatchExiting = Builder.createICmp(
CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1));
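To make the resulting control flow concrete, here is a small standalone sketch of the exit logic this transform emits (plain C++ over toy scalar values, not LLVM code): the per-lane early-exit mask is reduced with any-of, the result is OR'd with the latch condition to decide whether to leave the vector loop, and middle.split then branches on the any-of result to dispatch to the early exit block.

#include <array>
#include <iostream>
#include <numeric>

// Toy model (not LLVM code) of the exit logic emitted for one vector
// iteration with VF = 4.
int main() {
  std::array<bool, 4> EarlyExitMask = {false, false, true, false}; // per-lane early-exit cond
  bool LatchCond = false; // index.next == vector-trip-count

  // any-of reduction over the early-exit mask (vp<[[EA_TAKEN]]> in the plans).
  bool EarlyExitTaken =
      std::accumulate(EarlyExitMask.begin(), EarlyExitMask.end(), false,
                      [](bool A, bool B) { return A || B; });
  // Combined exit condition: leave the vector loop if either condition holds.
  bool ExitVectorLoop = EarlyExitTaken || LatchCond;

  if (ExitVectorLoop) {
    // middle.split: dispatch to the early exit block if an early exit fired,
    // otherwise fall through to the original middle block.
    std::cout << (EarlyExitTaken ? "branch to early exit block\n"
                                 : "branch to middle.block\n");
  } else {
    std::cout << "continue vector loop\n";
  }
  return 0;
}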
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b5b7144568ff54..f7bbae25279fce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -124,9 +124,9 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
- static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
- Loop *OrigLoop,
- VPRecipeBuilder &RecipeBuilder);
+ static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ VPRecipeBuilder &RecipeBuilder);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
index 47304c571bfcb1..5da97ef8f9b3bc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
+declare void @init(ptr)
+
define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
@@ -57,16 +59,18 @@ define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
; CHECK-NEXT: No successors
; CHECK-NEXT: }
entry:
+ %src = alloca [128 x i32]
+ call void @init(ptr %src)
br label %loop.header
loop.header:
%iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
- %c.1 = icmp uge i64 %iv, %N
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src
+ %c.1 = icmp eq i32 %l, 10
br i1 %c.1, label %e1, label %loop.latch
loop.latch:
- %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
- store i32 0, ptr %arrayidx
%inc = add nuw i64 %iv, 1
%c.2 = icmp eq i64 %inc, 128
br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
@@ -80,78 +84,6 @@ e2:
ret i64 %p2
}
-define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e>:
-; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-;
-entry:
- br label %loop.header
-
-loop.header:
- %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
- %c.1 = icmp uge i64 %iv, %N
- br i1 %c.1, label %e, label %loop.latch
-
-loop.latch:
- %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
- store i32 0, ptr %arrayidx
- %inc = add nuw i64 %iv, 1
- %c.2 = icmp eq i64 %inc, 128
- br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
-
-e:
- %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
- ret i64 %p1
-}
-
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}
>From 9212f9647692387fb4a6ff8cc6a90b1fa2b73628 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 5 Nov 2024 15:07:50 +0000
Subject: [PATCH 6/9] [VPlan] Generalize collectUsersInExitBlocks for multiple
exit bbs.
Generalize collectUsersInExitBlock to collect exit users across multiple
exit blocks. Exit blocks are the leaf nodes of the VPlan (blocks without
successors), excluding the scalar header.
Split off from https://github.com/llvm/llvm-project/pull/112138
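As a side note for reviewers, here is a minimal standalone sketch of the collection strategy described above, using toy stand-in types rather than the actual VPlan/VPBlockBase classes (the real logic lives in collectUsersInExitBlocks and the getExitBlocks helper added later in this series): exit blocks are the leaf nodes reachable from the vector loop region's successor, excluding the scalar header.

#include <algorithm>
#include <iostream>
#include <vector>

struct Block {
  const char *Name;
  std::vector<Block *> Succs;
};

// Collect the leaf blocks (no successors) reachable from Start, skipping the
// designated scalar header -- the same criterion the patch uses to identify
// VPlan exit blocks.
static std::vector<Block *> collectExitBlocks(Block *Start,
                                              Block *ScalarHeader) {
  std::vector<Block *> Exits, Visited, Worklist{Start};
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();
    if (std::find(Visited.begin(), Visited.end(), B) != Visited.end())
      continue;
    Visited.push_back(B);
    if (B->Succs.empty() && B != ScalarHeader)
      Exits.push_back(B);
    Worklist.insert(Worklist.end(), B->Succs.begin(), B->Succs.end());
  }
  return Exits;
}

int main() {
  // Shape of the CFG after the transform: middle.split -> {e1, middle.block},
  // middle.block -> {e2, scalar.ph}, scalar.ph -> scalar header.
  Block E1{"ir-bb<e1>", {}}, E2{"ir-bb<e2>", {}};
  Block ScalarHeader{"ir-bb<loop.header>", {}};
  Block ScalarPH{"scalar.ph", {&ScalarHeader}};
  Block MiddleBlock{"middle.block", {&E2, &ScalarPH}};
  Block MiddleSplit{"middle.split", {&E1, &MiddleBlock}};
  for (Block *Exit : collectExitBlocks(&MiddleSplit, &ScalarHeader))
    std::cout << Exit->Name << "\n"; // prints ir-bb<e2> and ir-bb<e1>, not the scalar header
  return 0;
}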
---
.../Transforms/Vectorize/LoopVectorize.cpp | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 47e78a916f8cea..4ccbc8e95d1edf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8830,12 +8830,12 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
}
}
-// Collect VPIRInstructions for phis in the original exit block that are modeled
+// Collect VPIRInstructions for phis in the exit blocks that are modeled
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
// modeled explicitly yet and won't be included. Those are un-truncated
// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
// increments.
-static SetVector<VPIRInstruction *> collectUsersInExitBlock(
+static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
SetVector<VPIRInstruction *> ExitUsersToFix;
@@ -8882,8 +8882,8 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated.
static void
-addUsersInExitBlock(VPlan &Plan,
- const SetVector<VPIRInstruction *> &ExitUsersToFix) {
+addUsersInExitBlocks(VPlan &Plan,
+ const SetVector<VPIRInstruction *> &ExitUsersToFix) {
if (ExitUsersToFix.empty())
return;
@@ -9179,12 +9179,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPlanTransforms::handleUncountableEarlyExit(*Plan, *PSE.getSE(), OrigLoop,
RecipeBuilder);
}
- addScalarResumePhis(RecipeBuilder, *Plan);
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
- addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
- addUsersInExitBlock(*Plan, ExitUsersToFix);
-
+ addScalarResumePhis(RecipeBuilder, *Plan);
+ SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
+ OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+ addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+ addUsersInExitBlocks(*Plan, ExitUsersToFix);
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
>From 5cb0851d69513eee7c14f21b598a014925ec6ae1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 5 Nov 2024 15:36:04 +0000
Subject: [PATCH 7/9] !fixup address comments
---
.../Transforms/Vectorize/LoopVectorize.cpp | 62 ++---
.../Transforms/Vectorize/VPlanTransforms.cpp | 10 +-
.../LoopVectorize/X86/multi-exit-vplan.ll | 89 -------
.../X86/uncountable-early-exit-vplan.ll | 244 ++++++++++++++++++
4 files changed, 283 insertions(+), 122 deletions(-)
delete mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4ccbc8e95d1edf..5188aeed2f32b5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8845,11 +8845,6 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
continue;
auto *ExitVPBB = cast<VPIRBasicBlock>(VPB);
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
- BasicBlock *ExitingBB = find_singleton<BasicBlock>(
- to_vector(predecessors(ExitBB)),
- [OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
- return OrigLoop->contains(Pred) ? Pred : nullptr;
- });
for (VPRecipeBase &R : *ExitVPBB) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
if (!ExitIRI)
@@ -8857,23 +8852,27 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
if (!ExitPhi)
break;
- Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
- VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
- // Exit values for inductions are computed and updated outside of VPlan
- // and independent of induction recipes.
- // TODO: Compute induction exit values in VPlan.
- if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
- !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
- isa<VPWidenPointerInductionRecipe>(V) ||
- (isa<Instruction>(IncomingValue) &&
- OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
- any_of(IncomingValue->users(), [&Inductions](User *U) {
- auto *P = dyn_cast<PHINode>(U);
- return P && Inductions.contains(P);
- })))
- continue;
- ExitUsersToFix.insert(ExitIRI);
- ExitIRI->addOperand(V);
+ for (BasicBlock *ExitingBB : predecessors(ExitBB)) {
+ if (!OrigLoop->contains(ExitingBB))
+ continue;
+ Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+ VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
+ // Exit values for inductions are computed and updated outside of VPlan
+ // and independent of induction recipes.
+ // TODO: Compute induction exit values in VPlan.
+ if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+ !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+ isa<VPWidenPointerInductionRecipe>(V) ||
+ (isa<Instruction>(IncomingValue) &&
+ OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
+ any_of(IncomingValue->users(), [&Inductions](User *U) {
+ auto *P = dyn_cast<PHINode>(U);
+ return P && Inductions.contains(P);
+ })))
+ continue;
+ ExitUsersToFix.insert(ExitIRI);
+ ExitIRI->addOperand(V);
+ }
}
}
return ExitUsersToFix;
@@ -8887,23 +8886,26 @@ addUsersInExitBlocks(VPlan &Plan,
if (ExitUsersToFix.empty())
return;
- auto *MiddleVPBB = Plan.getMiddleBlock();
- VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
-
// Introduce extract for exiting values and update the VPIRInstructions
// modeling the corresponding LCSSA phis.
for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
+
VPValue *V = ExitIRI->getOperand(0);
// Pass live-in values used by exit phis directly through to their users in
// the exit block.
if (V->isLiveIn())
continue;
- LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
- VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
- {V, Plan.getOrAddLiveIn(ConstantInt::get(
- IntegerType::get(Ctx, 32), 1))});
- ExitIRI->setOperand(0, Ext);
+ for (VPBlockBase *PredVPB : ExitIRI->getParent()->getPredecessors()) {
+ auto *PredVPBB = cast<VPBasicBlock>(PredVPB);
+ VPBuilder B(PredVPBB, PredVPBB->getFirstNonPhi());
+
+ LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
+ VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
+ {V, Plan.getOrAddLiveIn(ConstantInt::get(
+ IntegerType::get(Ctx, 32), 1))});
+ ExitIRI->setOperand(0, Ext);
+ }
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d336827b23ddf3..661b100d7881c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1815,9 +1815,13 @@ void VPlanTransforms::handleUncountableEarlyExit(
auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
- VPIRBasicBlock *VPExitBlock;
- VPExitBlock = VPIRBasicBlock::fromBasicBlock(
- !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+ VPBasicBlock *VPExitBlock;
+ if (OrigLoop->getUniqueExitBlock()) {
+ VPExitBlock = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+ } else {
+ VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+ !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+ }
VPValue *M = RecipeBuilder.getBlockInMask(
OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
deleted file mode 100644
index 5da97ef8f9b3bc..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
+++ /dev/null
@@ -1,89 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
-
-declare void @init(ptr)
-
-define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-entry:
- %src = alloca [128 x i32]
- call void @init(ptr %src)
- br label %loop.header
-
-loop.header:
- %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
- %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
- %l = load i32, ptr %gep.src
- %c.1 = icmp eq i32 %l, 10
- br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
- %inc = add nuw i64 %iv, 1
- %c.2 = icmp eq i64 %inc, 128
- br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
-
-e1:
- %p1 = phi i64 [ 0, %loop.header ]
- ret i64 %p1
-
-e2:
- %p2 = phi i64 [ 1, %loop.latch ]
- ret i64 %p2
-}
-
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
new file mode 100644
index 00000000000000..13f5671f893651
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
+
+declare void @init(ptr)
+
+define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ %src = alloca [128 x i32]
+ call void @init(ptr %src)
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src
+ %c.1 = icmp eq i32 %l, 10
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header
+
+e1:
+ %p1 = phi i64 [ 0, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_different_exits_load_exit_value() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ %src = alloca [128 x i64]
+ call void @init(ptr %src)
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+ %l = load i64, ptr %gep.src
+ %c.1 = icmp eq i64 %l, 10
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header
+
+e1:
+ %p1 = phi i64 [ %l, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_load_exit_value() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+ %src = alloca [128 x i64]
+ call void @init(ptr %src)
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+ %l = load i64, ptr %gep.src
+ %l.2 = load i64, ptr %gep.src
+ %c.1 = icmp eq i64 %l, 10
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e1, label %loop.header
+
+e1:
+ %p1 = phi i64 [ %l, %loop.header ], [ %l.2, %loop.latch ]
+ ret i64 %p1
+}
>From e849195a4994293aab10fb9786dc006064374ac0 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 5 Nov 2024 20:48:45 +0000
Subject: [PATCH 8/9] !fixup address more comments.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 27 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 4 +
llvm/lib/Transforms/Vectorize/VPlanCFG.h | 9 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 -
.../Transforms/Vectorize/VPlanTransforms.cpp | 13 +-
.../Transforms/Vectorize/VPlanTransforms.h | 13 +-
.../LoopVectorize/X86/multi-exit-codegen.ll | 240 -----------------
.../X86/uncountable-early-exit-vplan.ll | 244 ------------------
.../uncountable-early-exit-vplan.ll | 171 ++++++++++++
9 files changed, 218 insertions(+), 504 deletions(-)
delete mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
delete mode 100644 llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5188aeed2f32b5..1c02db88c3f3c8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8839,11 +8839,7 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
SetVector<VPIRInstruction *> ExitUsersToFix;
- for (VPBlockBase *VPB : vp_depth_first_shallow(
- Plan.getVectorLoopRegion()->getSingleSuccessor())) {
- if (VPB->getNumSuccessors() != 0 || VPB == Plan.getScalarHeader())
- continue;
- auto *ExitVPBB = cast<VPIRBasicBlock>(VPB);
+ for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
for (VPRecipeBase &R : *ExitVPBB) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
@@ -9178,14 +9174,31 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
RecipeBuilder.fixHeaderPhis();
if (Legal->hasUncountableEarlyExit()) {
- VPlanTransforms::handleUncountableEarlyExit(*Plan, *PSE.getSE(), OrigLoop,
- RecipeBuilder);
+ VPlanTransforms::handleUncountableEarlyExit(
+ *Plan, *PSE.getSE(), OrigLoop, Legal->getUncountableExitingBlocks(),
+ RecipeBuilder);
}
addScalarResumePhis(RecipeBuilder, *Plan);
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addUsersInExitBlocks(*Plan, ExitUsersToFix);
+
+  // Currently only live-in exit values are supported. We also bail out if any
+  // exit value isn't handled in VPlan yet, i.e. a VPIRInstruction in an exit
+  // block without any operands.
+ if (Legal->hasUncountableEarlyExit()) {
+ if (any_of(Plan->getExitBlocks(), [](VPIRBasicBlock *ExitBB) {
+ return any_of(*ExitBB, [](VPRecipeBase &R) {
+ auto VPIRI = cast<VPIRInstruction>(&R);
+ return VPIRI->getNumOperands() == 0 ||
+ any_of(VPIRI->operands(),
+ [](VPValue *Op) { return !Op->isLiveIn(); });
+ });
+ }))
+ return nullptr;
+ }
+
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 44fd49fed6ad90..34a3a180bfa52a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3832,6 +3832,10 @@ class VPlan {
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
}
+  /// Return the exit blocks of the VPlan, that is, leaf blocks (blocks without
+  /// successors) other than the scalar header.
+ auto getExitBlocks();
+
/// The trip count of the original loop.
VPValue *getTripCount() const {
assert(TripCount && "trip count needs to be set before accessing it");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index 89e2e7514dac2b..6ca388a953a6ff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -306,6 +306,15 @@ template <> struct GraphTraits<VPlan *> {
}
};
+inline auto VPlan::getExitBlocks() {
+ VPBlockBase *ScalarHeader = getScalarHeader();
+ return make_filter_range(
+ VPBlockUtils::blocksOnly<VPIRBasicBlock>(
+ vp_depth_first_shallow(getVectorLoopRegion()->getSingleSuccessor())),
+ [ScalarHeader](VPIRBasicBlock *VPIRBB) {
+ return VPIRBB != ScalarHeader && VPIRBB->getNumSuccessors() == 0;
+ });
+}
} // namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANCFG_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7f582ce8c99433..a1a0c2ffcf0597 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -872,7 +872,6 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
O << Indent << "IR " << I;
if (getNumOperands() != 0) {
- // assert(getNumOperands() == 1 && "can have at most 1 operand");
O << " (extra operand: ";
printOperands(O, SlotTracker);
O << ")";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 661b100d7881c2..ed37e492b34fb1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1793,31 +1793,26 @@ void VPlanTransforms::createInterleaveGroups(
void VPlanTransforms::handleUncountableEarlyExit(
VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
+ ArrayRef<BasicBlock *> UncountableExitingBlocks,
VPRecipeBuilder &RecipeBuilder) {
auto *LatchVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
VPBuilder Builder(LatchVPBB->getTerminator());
auto *MiddleVPBB = Plan.getMiddleBlock();
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
-
VPValue *EarlyExitTaken = nullptr;
- SmallVector<BasicBlock *> ExitingBBs;
- OrigLoop->getExitingBlocks(ExitingBBs);
  // Process all uncountable exiting blocks. For each exiting block, update
  // EarlyExitTaken, which tracks whether any uncountable early exit has been
  // taken. Also split the middle block and branch to the exit block of the
  // early exit if it has been taken.
- for (BasicBlock *Exiting : ExitingBBs) {
- if (Exiting == OrigLoop->getLoopLatch())
- continue;
-
+ for (BasicBlock *Exiting : UncountableExitingBlocks) {
auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
- VPBasicBlock *VPExitBlock;
+ VPIRBasicBlock *VPExitBlock;
if (OrigLoop->getUniqueExitBlock()) {
- VPExitBlock = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+ VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
} else {
VPExitBlock = VPIRBasicBlock::fromBasicBlock(
!OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f7bbae25279fce..dc5dec2f1b84a6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -124,9 +124,16 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
- static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE,
- Loop *OrigLoop,
- VPRecipeBuilder &RecipeBuilder);
+  /// Update \p Plan to account for the uncountable exiting blocks in \p
+  /// UncountableExitingBlocks by
+  /// * updating the condition exiting the vector loop to also include the
+  ///   early exit conditions
+  /// * splitting the original middle block to branch to the early exit blocks
+  ///   if any early exit has been taken.
+ static void
+ handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
+ ArrayRef<BasicBlock *> UncountableExitingBlocks,
+ VPRecipeBuilder &RecipeBuilder);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
deleted file mode 100644
index 0c33715c6bd271..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
+++ /dev/null
@@ -1,240 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization %s | FileCheck --check-prefix=MULTI %s
-; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s
-
-define i64 @multi_exit_with_store(ptr %p, i64 %N) {
-; MULTI-LABEL: define i64 @multi_exit_with_store(
-; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; MULTI-NEXT: [[ENTRY:.*]]:
-; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; MULTI: [[VECTOR_PH]]:
-; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; MULTI-NEXT: br label %[[VECTOR_BODY:.*]]
-; MULTI: [[VECTOR_BODY]]:
-; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
-; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
-; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
-; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
-; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
-; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; MULTI: [[MIDDLE_SPLIT]]:
-; MULTI-NEXT: br i1 [[TMP6]], label %[[E1:.*]], label %[[MIDDLE_BLOCK:.*]]
-; MULTI: [[MIDDLE_BLOCK]]:
-; MULTI-NEXT: br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]]
-; MULTI: [[SCALAR_PH]]:
-; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; MULTI-NEXT: br label %[[LOOP_HEADER:.*]]
-; MULTI: [[LOOP_HEADER]]:
-; MULTI-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; MULTI-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
-; MULTI-NEXT: br i1 [[CMP1]], label %[[E1]], label %[[LOOP_LATCH]]
-; MULTI: [[LOOP_LATCH]]:
-; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
-; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
-; MULTI-NEXT: [[INC]] = add nuw i64 [[I_07]], 1
-; MULTI-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
-; MULTI-NEXT: br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
-; MULTI: [[E1]]:
-; MULTI-NEXT: ret i64 0
-; MULTI: [[E2]]:
-; MULTI-NEXT: ret i64 1
-;
-; DEFAULT-LABEL: define i64 @multi_exit_with_store(
-; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; DEFAULT-NEXT: [[ENTRY:.*]]:
-; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
-; DEFAULT-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4
-; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; DEFAULT: [[VECTOR_PH]]:
-; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
-; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]]
-; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]]
-; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
-; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
-; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br label %[[SCALAR_PH]]
-; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
-; DEFAULT: [[LOOP_HEADER]]:
-; DEFAULT-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; DEFAULT-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
-; DEFAULT-NEXT: br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]]
-; DEFAULT: [[LOOP_LATCH]]:
-; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
-; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
-; DEFAULT-NEXT: [[INC]] = add nuw i64 [[I_07]], 1
-; DEFAULT-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
-; DEFAULT-NEXT: br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
-; DEFAULT: [[E1]]:
-; DEFAULT-NEXT: ret i64 0
-; DEFAULT: [[E2]]:
-; DEFAULT-NEXT: ret i64 1
-;
-entry:
- br label %loop.header
-
-loop.header:
- %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
- %c.1 = icmp uge i64 %iv, %N
- br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
- %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
- store i32 0, ptr %arrayidx
- %inc = add nuw i64 %iv, 1
- %c.2 = icmp eq i64 %inc, 128
- br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
-
-e1:
- ret i64 0
-
-e2:
- ret i64 1
-}
-
-define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
-; MULTI-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
-; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; MULTI-NEXT: [[ENTRY:.*]]:
-; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; MULTI: [[VECTOR_PH]]:
-; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; MULTI-NEXT: br label %[[VECTOR_BODY:.*]]
-; MULTI: [[VECTOR_BODY]]:
-; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
-; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
-; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
-; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
-; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
-; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; MULTI: [[MIDDLE_SPLIT]]:
-; MULTI-NEXT: br i1 [[TMP6]], label %[[E:.*]], label %[[MIDDLE_BLOCK:.*]]
-; MULTI: [[MIDDLE_BLOCK]]:
-; MULTI-NEXT: br i1 true, label %[[E]], label %[[SCALAR_PH]]
-; MULTI: [[SCALAR_PH]]:
-; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; MULTI-NEXT: br label %[[LOOP_HEADER:.*]]
-; MULTI: [[LOOP_HEADER]]:
-; MULTI-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; MULTI-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
-; MULTI-NEXT: br i1 [[C_1]], label %[[E]], label %[[LOOP_LATCH]]
-; MULTI: [[LOOP_LATCH]]:
-; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
-; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
-; MULTI-NEXT: [[INC]] = add nuw i64 [[IV]], 1
-; MULTI-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128
-; MULTI-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
-; MULTI: [[E]]:
-; MULTI-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[MIDDLE_SPLIT]] ]
-; MULTI-NEXT: ret i64 [[P1]]
-;
-; DEFAULT-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
-; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; DEFAULT-NEXT: [[ENTRY:.*]]:
-; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
-; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
-; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; DEFAULT: [[VECTOR_PH]]:
-; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
-; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
-; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
-; DEFAULT: [[VECTOR_BODY]]:
-; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP3]]
-; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
-; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br label %[[SCALAR_PH]]
-; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
-; DEFAULT: [[LOOP_HEADER]]:
-; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; DEFAULT-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
-; DEFAULT-NEXT: br i1 [[C_1]], label %[[E:.*]], label %[[LOOP_LATCH]]
-; DEFAULT: [[LOOP_LATCH]]:
-; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
-; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
-; DEFAULT-NEXT: [[INC]] = add nuw i64 [[IV]], 1
-; DEFAULT-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128
-; DEFAULT-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
-; DEFAULT: [[E]]:
-; DEFAULT-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ]
-; DEFAULT-NEXT: ret i64 [[P1]]
-;
-entry:
- br label %loop.header
-
-loop.header:
- %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
- %c.1 = icmp uge i64 %iv, %N
- br i1 %c.1, label %e, label %loop.latch
-
-loop.latch:
- %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
- store i32 0, ptr %arrayidx
- %inc = add nuw i64 %iv, 1
- %c.2 = icmp eq i64 %inc, 128
- br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
-
-e:
- %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
- ret i64 %p1
-}
-
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-;.
-; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; MULTI: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; MULTI: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; MULTI: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; MULTI: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; MULTI: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
-; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
deleted file mode 100644
index 13f5671f893651..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
+++ /dev/null
@@ -1,244 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
-
-declare void @init(ptr)
-
-define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-entry:
- %src = alloca [128 x i32]
- call void @init(ptr %src)
- br label %loop.header
-
-loop.header:
- %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
- %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
- %l = load i32, ptr %gep.src
- %c.1 = icmp eq i32 %l, 10
- br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
- %inc = add nuw i64 %iv, 1
- %c.2 = icmp eq i64 %inc, 128
- br i1 %c.2, label %e2, label %loop.header
-
-e1:
- %p1 = phi i64 [ 0, %loop.header ]
- ret i64 %p1
-
-e2:
- %p2 = phi i64 [ 1, %loop.latch ]
- ret i64 %p2
-}
-
-define i64 @multi_exiting_to_different_exits_load_exit_value() {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-entry:
- %src = alloca [128 x i64]
- call void @init(ptr %src)
- br label %loop.header
-
-loop.header:
- %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
- %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
- %l = load i64, ptr %gep.src
- %c.1 = icmp eq i64 %l, 10
- br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
- %inc = add nuw i64 %iv, 1
- %c.2 = icmp eq i64 %inc, 128
- br i1 %c.2, label %e2, label %loop.header
-
-e1:
- %p1 = phi i64 [ %l, %loop.header ]
- ret i64 %p1
-
-e2:
- %p2 = phi i64 [ 1, %loop.latch ]
- ret i64 %p2
-}
-
-define i64 @multi_exiting_to_same_exit_load_exit_value() {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-entry:
- %src = alloca [128 x i64]
- call void @init(ptr %src)
- br label %loop.header
-
-loop.header:
- %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
- %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
- %l = load i64, ptr %gep.src
- %l.2 = load i64, ptr %gep.src
- %c.1 = icmp eq i64 %l, 10
- br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
- %inc = add nuw i64 %iv, 1
- %c.2 = icmp eq i64 %inc, 128
- br i1 %c.2, label %e1, label %loop.header
-
-e1:
- %p1 = phi i64 [ %l, %loop.header ], [ %l.2, %loop.latch ]
- ret i64 %p1
-}
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
new file mode 100644
index 00000000000000..d840646a259529
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
+
+declare void @init(ptr)
+
+define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: IR %src = alloca [128 x i32], align 4
+; CHECK-NEXT: IR call void @init(ptr %src)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<%3>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK: No successors
+; CHECK-NEXT: }
+entry:
+ %src = alloca [128 x i32]
+ call void @init(ptr %src)
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+ %l = load i32, ptr %gep.src
+ %c.1 = icmp eq i32 %l, 10
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header
+
+e1:
+ %p1 = phi i64 [ 0, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_different_exits_load_exit_value() {
+; CHECK-NOT: VPlan 'Final VPlan for VF={4},UF={1}' {
+entry:
+ %src = alloca [128 x i64]
+ call void @init(ptr %src)
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+ %l = load i64, ptr %gep.src
+ %c.1 = icmp eq i64 %l, 10
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header
+
+e1:
+ %p1 = phi i64 [ %l, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_load_exit_value() {
+; CHECK-NOT: VPlan 'Final VPlan for VF={4},UF={1}' {
+
+entry:
+ %src = alloca [128 x i64]
+ call void @init(ptr %src)
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+ %l = load i64, ptr %gep.src
+ %l.2 = load i64, ptr %gep.src
+ %c.1 = icmp eq i64 %l, 10
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e1, label %loop.header
+
+e1:
+ %p1 = phi i64 [ %l, %loop.header ], [ %l.2, %loop.latch ]
+ ret i64 %p1
+}
+
+define i64 @multi_exiting_to_different_exits_induction_exit_value() {
+; CHECK-NOT: VPlan 'Final VPlan for VF={4},UF={1}' {
+entry:
+ %src = alloca [128 x i64]
+ call void @init(ptr %src)
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+ %l = load i64, ptr %gep.src
+ %c.1 = icmp eq i64 %l, 10
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header
+
+e1:
+ %p1 = phi i64 [ %iv, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+
>From c53eca6a59f55edc3dcb2bb692956de2fd97bfce Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 6 Nov 2024 20:40:41 +0000
Subject: [PATCH 9/9] !fixup remove leftover canVectorizeEarlyExit
---
.../Vectorize/LoopVectorizationLegality.h | 3 ---
.../Vectorize/LoopVectorizationLegality.cpp | 19 -------------------
2 files changed, 22 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index af6fae44cf0f09..dc7e484a40a452 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,9 +287,6 @@ class LoopVectorizationLegality {
/// we can use in-order reductions.
bool canVectorizeFPMath(bool EnableStrictReductions);
- /// Returns true if the loop has an early exit that we can vectorize.
- bool canVectorizeEarlyExit() const;
-
/// Return true if we can vectorize this loop while folding its tail by
/// masking.
bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ed3808d2f30bf1..0267fb1adb16d6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1515,25 +1515,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
return true;
}
-bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
- // Currently only allow vectorizing loops with early exits, if early-exit
- // vectorization is explicitly enabled and the loop has metadata to force
- // vectorization.
-
- SmallVector<BasicBlock *> Exiting;
- TheLoop->getExitingBlocks(Exiting);
- if (Exiting.size() == 1)
- return false;
-
- LoopVectorizeHints Hints(TheLoop, true, *ORE);
- if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
- return false;
-
- Function *Fn = TheLoop->getHeader()->getParent();
- return Hints.allowVectorization(Fn, TheLoop,
- true /*VectorizeOnlyWhenForced*/);
-}
-
// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
bool UseVPlanNativePath) {