[llvm] [VPlan] Dispatch to multiple exit blocks via middle blocks. (PR #112138)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 4 13:11:04 PST 2024
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/112138
>From 245b56a80bca6369a9be3102308617f2a4a4d51b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 1 Oct 2024 20:57:24 +0100
Subject: [PATCH 1/4] [VPlan] Support VPIRBBs and VPIRInst phis with multiple
predecessors.
---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 5 ++++-
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++++----
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c1b97791331bcf..8609514c39e7d0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1056,7 +1056,10 @@ void VPlan::execute(VPTransformState *State) {
State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
// Generate code in the loop pre-header and body.
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ Entry);
+
+ for (VPBlockBase *Block : RPOT)
Block->execute(State);
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 41f13cc2d9a978..be3e958320e771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -857,12 +857,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
void VPIRInstruction::execute(VPTransformState &State) {
assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
"Only PHINodes can have extra operands");
- if (getNumOperands() == 1) {
- VPValue *ExitValue = getOperand(0);
+ for (const auto &[Idx, Op] : enumerate(operands())) {
+ VPValue *ExitValue = Op;
auto Lane = vputils::isUniformAfterVectorization(ExitValue)
? VPLane::getFirstLane()
: VPLane::getLastLaneForVF(State.VF);
- auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+ VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
+ auto *PredVPBB = Pred->getExitingBasicBlock();
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
@@ -890,7 +891,7 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
O << Indent << "IR " << I;
if (getNumOperands() != 0) {
- assert(getNumOperands() == 1 && "can have at most 1 operand");
+ // assert(getNumOperands() == 1 && "can have at most 1 operand");
O << " (extra operand: ";
printOperands(O, SlotTracker);
O << ")";
>From 47258deea863675e43fd7fd48376dce131441dc5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 18 Sep 2024 21:35:57 +0100
Subject: [PATCH 2/4] [VPlan] Dispatch to multiple exit blocks via middle
blocks.
A more lightweight variant of https://github.com/llvm/llvm-project/pull/109193,
which dispatches to multiple exit blocks via the middle blocks.
---
.../Vectorize/LoopVectorizationLegality.h | 3 +
.../Vectorize/LoopVectorizationLegality.cpp | 29 +++
.../Transforms/Vectorize/LoopVectorize.cpp | 82 +++---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 39 ++-
llvm/lib/Transforms/Vectorize/VPlan.h | 1 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 82 ++++++
.../Transforms/Vectorize/VPlanTransforms.h | 4 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 8 -
.../LoopVectorize/X86/multi-exit-codegen.ll | 240 ++++++++++++++++++
.../LoopVectorize/X86/multi-exit-cost.ll | 18 +-
.../LoopVectorize/X86/multi-exit-vplan.ll | 148 +++++++++++
12 files changed, 614 insertions(+), 56 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index dc7e484a40a452..af6fae44cf0f09 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,6 +287,9 @@ class LoopVectorizationLegality {
/// we can use in-order reductions.
bool canVectorizeFPMath(bool EnableStrictReductions);
+ /// Returns true if the loop has an early exit that we can vectorize.
+ bool canVectorizeEarlyExit() const;
+
/// Return true if we can vectorize this loop while folding its tail by
/// masking.
bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..ee53d28a4c8282 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
cl::desc("Enable recognition of non-constant strided "
"pointer induction variables."));
+static cl::opt<bool>
+ EnableEarlyExitVectorization("enable-early-exit-vectorization",
+ cl::init(false), cl::Hidden, cl::desc(""));
+
namespace llvm {
cl::opt<bool>
HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1378,6 +1382,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+ // When vectorizing early exits, create predicates for all blocks, except the
+ // header.
+ if (canVectorizeEarlyExit() && BB != TheLoop->getHeader())
+ return true;
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
@@ -1514,6 +1522,27 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
return true;
}
+bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
+ // Currently only allow vectorizing loops with early exits, if early-exit
+ // vectorization is explicitly enabled and the loop has metadata to force
+ // vectorization.
+ if (!EnableEarlyExitVectorization)
+ return false;
+
+ SmallVector<BasicBlock *> Exiting;
+ TheLoop->getExitingBlocks(Exiting);
+ if (Exiting.size() == 1)
+ return false;
+
+ LoopVectorizeHints Hints(TheLoop, true, *ORE);
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
+
+ Function *Fn = TheLoop->getHeader()->getParent();
+ return Hints.allowVectorization(Fn, TheLoop,
+ true /*VectorizeOnlyWhenForced*/);
+}
+
// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
bool UseVPlanNativePath) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8653498d32a12..befe8f7c0076a3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1363,9 +1363,11 @@ class LoopVectorizationCostModel {
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(
- dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
- return true;
+ if (!Legal->canVectorizeEarlyExit()) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+ return true;
+ }
}
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2575,7 +2577,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
- assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+ assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) ||
+ Legal->canVectorizeEarlyExit()) &&
"multiple exit loop without required epilogue?");
LoopMiddleBlock =
@@ -2758,8 +2761,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.
- assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
-
DenseMap<Value *, Value *> MissingVals;
// An external user of the last iteration's value should see the value that
@@ -2819,6 +2820,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
PHI->addIncoming(I.second, MiddleBlock);
}
+
+ assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
+ "Expected a single exit block");
}
namespace {
@@ -3599,7 +3603,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
TheLoop->getExitingBlocks(Exiting);
for (BasicBlock *E : Exiting) {
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
+ (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit()))
AddToWorklistIfAllowed(Cmp);
}
@@ -7692,12 +7697,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);
// 2.5 Collect reduction resume values.
- auto *ExitVPBB =
- cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
- for (VPRecipeBase &R : *ExitVPBB) {
- createAndCollectMergePhiForReduction(
- dyn_cast<VPInstruction>(&R), State, OrigLoop,
- State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+ VPBasicBlock *ExitVPBB = nullptr;
+ if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
+ ExitVPBB = cast<VPBasicBlock>(
+ BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ for (VPRecipeBase &R : *ExitVPBB) {
+ createAndCollectMergePhiForReduction(
+ dyn_cast<VPInstruction>(&R), State, OrigLoop,
+ State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+ }
}
// 2.6. Maintain Loop Hints
@@ -7723,6 +7731,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
}
+
TargetTransformInfo::UnrollingPreferences UP;
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7735,15 +7744,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
// 4. Adjust branch weight of the branch in the middle block.
- auto *MiddleTerm =
- cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
- if (MiddleTerm->isConditional() &&
- hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- // Assume that `Count % VectorTripCount` is equally distributed.
- unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
- assert(TripCount > 0 && "trip count should not be zero");
- const uint32_t Weights[] = {1, TripCount - 1};
- setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ if (ExitVPBB) {
+ auto *MiddleTerm =
+ cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+ if (MiddleTerm->isConditional() &&
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ // Assume that `Count % VectorTripCount` is equally distributed.
+ unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+ assert(TripCount > 0 && "trip count should not be zero");
+ const uint32_t Weights[] = {1, TripCount - 1};
+ setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ }
}
return State.ExpandedSCEVs;
@@ -8128,7 +8139,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
// adding uses of an otherwise potentially dead instruction.
- if (OrigLoop->isLoopExiting(Src))
+ if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src))
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8778,6 +8789,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+ if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
+ return {};
auto *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
// No edge from the middle block to the unique exit block has been inserted
@@ -8863,6 +8876,8 @@ static void addLiveOutsForFirstOrderRecurrences(
// TODO: Should be replaced by
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
// scalar region is modeled as well.
+ if (!VectorRegion->getSingleSuccessor())
+ return;
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9146,10 +9161,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
- addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
- addUsersInExitBlock(*Plan, ExitUsersToFix);
+ if (Legal->canVectorizeEarlyExit()) {
+ VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
+ RecipeBuilder);
+ } else {
+ SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
+ OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+ addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+ addUsersInExitBlock(*Plan, ExitUsersToFix);
+ }
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9277,8 +9297,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
using namespace VPlanPatternMatch;
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
- VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
@@ -9297,8 +9315,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPUser *U : Cur->users()) {
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
- assert(UserRecipe->getParent() == MiddleVPBB &&
- "U must be either in the loop region or the middle block.");
continue;
}
Worklist.insert(UserRecipe);
@@ -9403,6 +9419,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*LatchVPBB->begin());
+ if (!VectorLoopRegion->getSingleSuccessor())
+ return;
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8609514c39e7d0..eb7c808551340d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
// backedges. A backward successor is set when the branch is created.
const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ if (TermBr->getSuccessor(idx) &&
+ PredVPBlock == getPlan()->getVectorLoopRegion() &&
+ PredVPBlock->getNumSuccessors()) {
+ // Update PRedBB and TermBr for BranchOnMultiCond in predecessor.
+ PredBB = TermBr->getSuccessor(1);
+ TermBr = cast<BranchInst>(PredBB->getTerminator());
+ idx = 0;
+ }
assert(!TermBr->getSuccessor(idx) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, IRBB);
@@ -908,8 +916,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
if (!RequiresScalarEpilogueCheck) {
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
}
@@ -923,10 +931,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
- auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
- // The connection order corresponds to the operands of the conditional branch.
- VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
- VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ if (IRExitBlock) {
+ auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+ // The connection order corresponds to the operands of the conditional
+ // branch.
+ VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ }
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
// Here we use the same DebugLoc as the scalar loop latch terminator instead
@@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) {
// VPlan execution rather than earlier during VPlan construction.
BasicBlock *MiddleBB = State->CFG.ExitBB;
VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ getVectorLoopRegion()->getNumSuccessors() == 1
+ ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
+ : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
// Find the VPBB for the scalar preheader, relying on the current structure
// when creating the middle block and its successrs: if there's a single
// predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1044,6 +1058,10 @@ void VPlan::execute(VPTransformState *State) {
MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]);
assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
"scalar preheader cannot be wrapped already");
+ if (ScalarPhVPBB->getNumSuccessors() != 0) {
+ ScalarPhVPBB = cast<VPBasicBlock>(ScalarPhVPBB->getSuccessors()[1]);
+ MiddleVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+ }
replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
@@ -1065,6 +1083,10 @@ void VPlan::execute(VPTransformState *State) {
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+ if (!getVectorLoopRegion()->getSingleSuccessor())
+ VectorLatchBB =
+ cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
+
// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
@@ -1091,7 +1113,10 @@ void VPlan::execute(VPTransformState *State) {
// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
- Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+ if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI())
+ Inc->moveBefore(VectorLatchBB->getTerminator());
+ else
+ Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
// Use the steps for the last part as backedge value for the induction.
if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 59a084401cc9bf..21f44eac188936 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1274,6 +1274,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
+ AnyOf,
};
private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index be3e958320e771..9d5c609ad26043 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
default:
return true;
}
+ case VPExpandSCEVSC:
+ return getParent()->getPlan()->getTripCount() == getVPSingleValue();
case VPInterleaveSC:
return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
@@ -160,6 +162,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPPredInstPHISC:
case VPScalarCastSC:
return false;
+ case VPExpandSCEVSC:
+ return getParent()->getPlan()->getTripCount() == getVPSingleValue();
case VPInstructionSC:
return mayWriteToMemory();
case VPWidenCallSC: {
@@ -399,6 +403,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::AnyOf:
return true;
default:
return false;
@@ -674,6 +679,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
return NewPhi;
}
+ case VPInstruction::AnyOf: {
+ Value *A = State.get(getOperand(0));
+ return Builder.CreateOrReduce(A);
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
@@ -682,7 +691,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ComputeReductionResult;
+ getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::AnyOf;
}
bool VPInstruction::isSingleScalar() const {
@@ -745,6 +755,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
return false;
case Instruction::ICmp:
case Instruction::Select:
+ case Instruction::Or:
case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
@@ -840,6 +851,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
+ case VPInstruction::AnyOf:
+ O << "any-of";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d50f3c0c3f3e04..a86498eb9aa30c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -515,6 +515,12 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan.getEntry());
+ for (VPRecipeBase &R : make_early_inc_range(
+ reverse(*cast<VPBasicBlock>(Plan.getPreheader())))) {
+ if (isDeadRecipe(R))
+ R.eraseFromParent();
+ }
+
for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
@@ -1696,3 +1702,79 @@ void VPlanTransforms::createInterleaveGroups(
}
}
}
+
+void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ VPRecipeBuilder &RecipeBuilder) {
+ auto *LatchVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+ VPBuilder Builder(LatchVPBB->getTerminator());
+ auto *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+
+ const SCEV *BackedgeTakenCount =
+ SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+ const SCEV *TripCount = SE.getTripCountFromExitCount(
+ BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop);
+ VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE);
+ Plan.getTripCount()->replaceAllUsesWith(NewTC);
+ Plan.resetTripCount(NewTC);
+
+ VPValue *EarlyExitTaken = nullptr;
+ SmallVector<BasicBlock *> ExitingBBs;
+ OrigLoop->getExitingBlocks(ExitingBBs);
+ for (BasicBlock *Exiting : ExitingBBs) {
+ auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
+ BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
+ BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
+ VPIRBasicBlock *VPExitBlock;
+ if (OrigLoop->getUniqueExitBlock())
+ VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+ else
+ VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+ !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+
+ for (VPRecipeBase &R : *VPExitBlock) {
+ auto *ExitIRI = cast<VPIRInstruction>(&R);
+ auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
+ if (!ExitPhi)
+ break;
+ Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting);
+ VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
+ ExitIRI->addOperand(V);
+ }
+
+ if (Exiting == OrigLoop->getLoopLatch()) {
+ if (MiddleVPBB->getNumSuccessors() == 0) {
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ }
+ continue;
+ }
+
+ VPValue *M = RecipeBuilder.getBlockInMask(
+ OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+ auto *N = Builder.createNot(M);
+ EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N});
+
+ VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+ VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB);
+ VPBlockUtils::insertBlockAfter(NewMiddle, LoopRegion);
+ VPBlockUtils::connectBlocks(NewMiddle, VPExitBlock);
+ VPBlockUtils::connectBlocks(NewMiddle, MiddleVPBB);
+
+ VPBuilder MiddleBuilder(NewMiddle);
+ MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
+ // MiddleVPBB = NewMiddle;
+ }
+ auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
+ auto *IsLatchExiting = Builder.createICmp(
+ CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1));
+ auto *AnyExiting =
+ Builder.createNaryOp(Instruction::Or, {EarlyExitTaken, IsLatchExiting});
+ Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExiting);
+ Term->eraseFromParent();
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 60a44bfb0dca6b..9745211db275f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -123,6 +123,10 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
+
+ static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ VPRecipeBuilder &RecipeBuilder);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7ea5ee341cc547..1ac79f8887ab46 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -248,14 +248,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
return false;
}
- VPBlockBase *MiddleBB =
- IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor();
- if (IRBB != IRBB->getPlan()->getPreheader() &&
- IRBB->getSinglePredecessor() != MiddleBB) {
- errs() << "VPIRBasicBlock can only be used as pre-header or a successor of "
- "middle-block at the moment!\n";
- return false;
- }
return true;
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
new file mode 100644
index 00000000000000..0c33715c6bd271
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization %s | FileCheck --check-prefix=MULTI %s
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s
+
+define i64 @multi_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; MULTI-NEXT: [[ENTRY:.*]]:
+; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; MULTI: [[VECTOR_PH]]:
+; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; MULTI-NEXT: br label %[[VECTOR_BODY:.*]]
+; MULTI: [[VECTOR_BODY]]:
+; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
+; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
+; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; MULTI: [[MIDDLE_SPLIT]]:
+; MULTI-NEXT: br i1 [[TMP6]], label %[[E1:.*]], label %[[MIDDLE_BLOCK:.*]]
+; MULTI: [[MIDDLE_BLOCK]]:
+; MULTI-NEXT: br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]]
+; MULTI: [[SCALAR_PH]]:
+; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MULTI-NEXT: br label %[[LOOP_HEADER:.*]]
+; MULTI: [[LOOP_HEADER]]:
+; MULTI-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; MULTI-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; MULTI-NEXT: br i1 [[CMP1]], label %[[E1]], label %[[LOOP_LATCH]]
+; MULTI: [[LOOP_LATCH]]:
+; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; MULTI-NEXT: [[INC]] = add nuw i64 [[I_07]], 1
+; MULTI-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; MULTI-NEXT: br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; MULTI: [[E1]]:
+; MULTI-NEXT: ret i64 0
+; MULTI: [[E2]]:
+; MULTI-NEXT: ret i64 1
+;
+; DEFAULT-LABEL: define i64 @multi_exit_with_store(
+; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
+; DEFAULT-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]]
+; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
+; DEFAULT: [[VECTOR_BODY]]:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT: [[MIDDLE_BLOCK]]:
+; DEFAULT-NEXT: br label %[[SCALAR_PH]]
+; DEFAULT: [[SCALAR_PH]]:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
+; DEFAULT: [[LOOP_HEADER]]:
+; DEFAULT-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; DEFAULT-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; DEFAULT-NEXT: br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]]
+; DEFAULT: [[LOOP_LATCH]]:
+; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT: [[INC]] = add nuw i64 [[I_07]], 1
+; DEFAULT-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; DEFAULT-NEXT: br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: [[E1]]:
+; DEFAULT-NEXT: ret i64 0
+; DEFAULT: [[E2]]:
+; DEFAULT-NEXT: ret i64 1
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
+
+e1:
+ ret i64 0
+
+e2:
+ ret i64 1
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; MULTI-NEXT: [[ENTRY:.*]]:
+; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; MULTI: [[VECTOR_PH]]:
+; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; MULTI-NEXT: br label %[[VECTOR_BODY:.*]]
+; MULTI: [[VECTOR_BODY]]:
+; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; MULTI-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
+; MULTI-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; MULTI-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
+; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; MULTI-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; MULTI-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; MULTI-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; MULTI-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; MULTI-NEXT: br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; MULTI: [[MIDDLE_SPLIT]]:
+; MULTI-NEXT: br i1 [[TMP6]], label %[[E:.*]], label %[[MIDDLE_BLOCK:.*]]
+; MULTI: [[MIDDLE_BLOCK]]:
+; MULTI-NEXT: br i1 true, label %[[E]], label %[[SCALAR_PH]]
+; MULTI: [[SCALAR_PH]]:
+; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MULTI-NEXT: br label %[[LOOP_HEADER:.*]]
+; MULTI: [[LOOP_HEADER]]:
+; MULTI-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; MULTI-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
+; MULTI-NEXT: br i1 [[C_1]], label %[[E]], label %[[LOOP_LATCH]]
+; MULTI: [[LOOP_LATCH]]:
+; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
+; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; MULTI-NEXT: [[INC]] = add nuw i64 [[IV]], 1
+; MULTI-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; MULTI-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; MULTI: [[E]]:
+; MULTI-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[MIDDLE_SPLIT]] ]
+; MULTI-NEXT: ret i64 [[P1]]
+;
+; DEFAULT-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
+; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
+; DEFAULT-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
+; DEFAULT: [[VECTOR_BODY]]:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP3]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; DEFAULT-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT: [[MIDDLE_BLOCK]]:
+; DEFAULT-NEXT: br label %[[SCALAR_PH]]
+; DEFAULT: [[SCALAR_PH]]:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
+; DEFAULT: [[LOOP_HEADER]]:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; DEFAULT-NEXT: [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
+; DEFAULT-NEXT: br i1 [[C_1]], label %[[E:.*]], label %[[LOOP_LATCH]]
+; DEFAULT: [[LOOP_LATCH]]:
+; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
+; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT: [[INC]] = add nuw i64 [[IV]], 1
+; DEFAULT-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; DEFAULT-NEXT: br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT: [[E]]:
+; DEFAULT-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ]
+; DEFAULT-NEXT: ret i64 [[P1]]
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
+
+e:
+ %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+ ret i64 %p1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; MULTI: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; MULTI: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MULTI: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; MULTI: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; MULTI: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index cd128979fc1431..1c02f10753745c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
-; CHECK-NEXT: [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]])
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[N]], -1
; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]])
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP32:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT: [[TMP33:%.*]] = freeze i64 [[TMP32]]
+; CHECK-NEXT: [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]])
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1
; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
new file mode 100644
index 00000000000000..5c5d532b93bc89
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
+
+define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
+
+e1:
+ %p1 = phi i64 [ 0, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT: EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT: EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
+
+e:
+ %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+ ret i64 %p1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
>From 3831acb97053230cb09f8316ce1ada17be50564c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 31 Oct 2024 19:48:40 +0000
Subject: [PATCH 3/4] !fixup address first set of comments, thanks!
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 13 ++-----------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 16 +---------------
2 files changed, 3 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3a69750460f4d8..80a0fda81aeaf1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7711,16 +7711,13 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);
// 2.5 Collect reduction resume values.
- VPBasicBlock *ExitVPBB = nullptr;
- if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
- ExitVPBB = cast<VPBasicBlock>(
- BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ VPBasicBlock *ExitVPBB =
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
if (VectorizingEpilogue)
for (VPRecipeBase &R : *ExitVPBB) {
fixReductionScalarResumeWhenVectorizingEpilog(
&R, State, State.CFG.VPBB2IRBB[ExitVPBB]);
}
- }
// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
@@ -8809,8 +8806,6 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
- if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
- return {};
auto *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
// No edge from the middle block to the unique exit block has been inserted
@@ -8896,8 +8891,6 @@ static void addLiveOutsForFirstOrderRecurrences(
// TODO: Should be replaced by
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
// scalar region is modeled as well.
- if (!VectorRegion->getSingleSuccessor())
- return;
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9447,8 +9440,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*LatchVPBB->begin());
- if (!VectorLoopRegion->getSingleSuccessor())
- return;
VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index be3579b4cd5dde..4032468b4f76da 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,14 +474,6 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
// backedges. A backward successor is set when the branch is created.
const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
- if (TermBr->getSuccessor(idx) &&
- PredVPBlock == getPlan()->getVectorLoopRegion() &&
- PredVPBlock->getNumSuccessors()) {
- // Update PRedBB and TermBr for BranchOnMultiCond in predecessor.
- PredBB = TermBr->getSuccessor(1);
- TermBr = cast<BranchInst>(PredBB->getTerminator());
- idx = 0;
- }
assert(!TermBr->getSuccessor(idx) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, IRBB);
@@ -1043,9 +1035,7 @@ void VPlan::execute(VPTransformState *State) {
// VPlan execution rather than earlier during VPlan construction.
BasicBlock *MiddleBB = State->CFG.ExitBB;
VPBasicBlock *MiddleVPBB =
- getVectorLoopRegion()->getNumSuccessors() == 1
- ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
- : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
+ cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
// Find the VPBB for the scalar preheader, relying on the current structure
// when creating the middle block and its successrs: if there's a single
// predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1083,10 +1073,6 @@ void VPlan::execute(VPTransformState *State) {
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
- if (!getVectorLoopRegion()->getSingleSuccessor())
- VectorLatchBB =
- cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
-
// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
>From 64db0eea4073e1cdc3d394155754ed0653ca0c3d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 4 Nov 2024 21:10:24 +0000
Subject: [PATCH 4/4] !fixup clean up merge failures
---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 18 ++++++++++--------
llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 11 ++---------
.../LoopVectorize/X86/multi-exit-vplan.ll | 11 ++++++++++-
4 files changed, 24 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 924b63bbd9639f..e2c063928e9906 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -911,7 +911,6 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
if (!RequiresScalarEpilogueCheck) {
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
}
@@ -925,14 +924,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
- if (IRExitBlock) {
- auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
- // The connection order corresponds to the operands of the conditional
- // branch.
- VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
- VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ if (!IRExitBlock) {
+ auto *Term = cast<BranchInst>(TheLoop->getLoopLatch()->getTerminator());
+ IRExitBlock = TheLoop->contains(Term->getSuccessor(0))
+ ? Term->getSuccessor(1)
+ : Term->getSuccessor(0);
}
+ auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+ // The connection order corresponds to the operands of the conditional
+ // branch.
+ VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
// Here we use the same DebugLoc as the scalar loop latch terminator instead
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 861fea7ff469ff..8efa648a7e1ea7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3822,10 +3822,10 @@ class VPlan {
/// whether to execute the scalar tail loop or the exit block from the loop
/// latch.
const VPBasicBlock *getMiddleBlock() const {
- return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
}
VPBasicBlock *getMiddleBlock() {
- return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
}
/// The trip count of the original loop.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index eed3efd666e712..1369693b01971c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1797,8 +1797,7 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
auto *LatchVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
VPBuilder Builder(LatchVPBB->getTerminator());
- auto *MiddleVPBB =
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+ auto *MiddleVPBB = Plan.getMiddleBlock();
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
@@ -1818,7 +1817,7 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
VPIRBasicBlock *VPExitBlock;
- if (OrigLoop->getUniqueExitBlock())
+ if (OrigLoop->getUniqueExitBlock() || Exiting == OrigLoop->getLoopLatch())
VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
else
VPExitBlock = VPIRBasicBlock::fromBasicBlock(
@@ -1835,11 +1834,6 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
}
if (Exiting == OrigLoop->getLoopLatch()) {
- if (MiddleVPBB->getNumSuccessors() == 0) {
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
- VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
- VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
- }
continue;
}
@@ -1856,7 +1850,6 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
VPBuilder MiddleBuilder(NewMiddle);
MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
- // MiddleVPBB = NewMiddle;
}
auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
auto *IsLatchExiting = Builder.createICmp(
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
index 5c5d532b93bc89..47304c571bfcb1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -49,9 +49,13 @@ define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-;
entry:
br label %loop.header
@@ -120,6 +124,11 @@ define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT: IR %c.1 = icmp uge i64 %iv, %N
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
More information about the llvm-commits
mailing list