[llvm] [VPlan] Dispatch to multiple exit blocks via middle blocks. (PR #112138)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 13 06:53:55 PDT 2024
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/112138
A more lightweight variant of https://github.com/llvm/llvm-project/pull/109193
that dispatches to multiple exit blocks via the middle blocks.
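
For illustration, the loop shape this handles (and that the new tests below
use) has an early exit in the header and the counted exit in the latch; a
minimal sketch in LLVM IR:

  loop.header:
    %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
    %c.1 = icmp uge i64 %iv, %N
    br i1 %c.1, label %e1, label %loop.latch     ; early exit

  loop.latch:
    ; ... loop body ...
    %inc = add nuw i64 %iv, 1
    %c.2 = icmp eq i64 %inc, 128
    br i1 %c.2, label %e2, label %loop.header    ; counted (latch) exit

The vector loop now exits as soon as either condition fires in any lane; a
new middle.split block then dispatches either to the early-exit block or on
to the regular middle block.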
From 79d5c6aa3d013ec31d193abc8c95c557baba78ee Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 1 Oct 2024 20:57:24 +0100
Subject: [PATCH 1/2] [VPlan] Support VPIRBBs and VPIRInst phis with multiple
predecessors.
---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 5 ++++-
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++++----
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5e3a6388094940..5a8469f800cada 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1060,7 +1060,10 @@ void VPlan::execute(VPTransformState *State) {
State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
// Generate code in the loop pre-header and body.
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ Entry);
+
+ for (VPBlockBase *Block : RPOT)
Block->execute(State);
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2948ecc580edc0..68397d9f8bf5c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -848,12 +848,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
void VPIRInstruction::execute(VPTransformState &State) {
assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
"Only PHINodes can have extra operands");
- if (getNumOperands() == 1) {
- VPValue *ExitValue = getOperand(0);
+ for (const auto &[Idx, Op] : enumerate(operands())) {
+ VPValue *ExitValue = Op;
auto Lane = vputils::isUniformAfterVectorization(ExitValue)
? VPLane::getFirstLane()
: VPLane::getLastLaneForVF(State.VF);
- auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+ VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
+ auto *PredVPBB = Pred->getExitingBasicBlock();
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
@@ -881,7 +882,7 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
O << Indent << "IR " << I;
if (getNumOperands() != 0) {
- assert(getNumOperands() == 1 && "can have at most 1 operand");
+    // Exit phis may have one extra operand per predecessor; print them all.
O << " (extra operand: ";
printOperands(O, SlotTracker);
O << ")";
From e4c27f0516f4b7510820c69cbbc61cd6a479d28b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 18 Sep 2024 21:35:57 +0100
Subject: [PATCH 2/2] [VPlan] Dispatch to multiple exit blocks via middle
blocks.
A more lightweight variant of https://github.com/llvm/llvm-project/pull/109193
that dispatches to multiple exit blocks via the middle blocks.
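
Concretely, the latch now ORs an any-of reduction of the early-exit mask
with the counted-exit condition, and a new middle.split block dispatches
between the exits. A condensed sketch of the generated skeleton (based on
the multi-exit-codegen.ll test below; value names are illustrative):

  vector.body:
    ; %early = any-of(early-exit mask), %main = icmp eq(iv.next, n.vec)
    %exit.any = or i1 %early, %main
    br i1 %exit.any, label %middle.split, label %vector.body

  middle.split:
    br i1 %early, label %e1, label %middle.block   ; early exit taken?

  middle.block:
    br i1 %main.cmp, label %e2, label %scalar.ph   ; usual trip-count check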
---
.../Vectorize/LoopVectorizationLegality.h | 2 +
.../Vectorize/LoopVectorizationLegality.cpp | 34 +++++
.../Transforms/Vectorize/LoopVectorize.cpp | 90 +++++++----
llvm/lib/Transforms/Vectorize/VPlan.cpp | 39 ++++-
llvm/lib/Transforms/Vectorize/VPlan.h | 1 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 25 ++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 82 ++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 4 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 8 -
.../LoopVectorize/X86/multi-exit-codegen.ll | 126 +++++++++++++++
.../LoopVectorize/X86/multi-exit-cost.ll | 18 +--
.../LoopVectorize/X86/multi-exit-vplan.ll | 145 ++++++++++++++++++
12 files changed, 515 insertions(+), 59 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index dc7e484a40a452..19fa7b8ae9d6fe 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,6 +287,8 @@ class LoopVectorizationLegality {
/// we can use in-order reductions.
bool canVectorizeFPMath(bool EnableStrictReductions);
+ bool canVectorizeMultiCond() const;
+
/// Return true if we can vectorize this loop while folding its tail by
/// masking.
bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..4e76e11d80a723 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,6 +43,9 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
cl::desc("Enable recognition of non-constant strided "
"pointer induction variables."));
+static cl::opt<bool> EnableMultiCond("enable-multi-cond-vectorization",
+                                     cl::init(false), cl::Hidden, cl::desc("Enable vectorization of loops with multiple exit conditions."));
+
namespace llvm {
cl::opt<bool>
HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1378,6 +1381,8 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+ if (canVectorizeMultiCond() && BB != TheLoop->getHeader())
+ return true;
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
@@ -1514,6 +1519,35 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
return true;
}
+bool LoopVectorizationLegality::canVectorizeMultiCond() const {
+ if (!EnableMultiCond)
+ return false;
+ SmallVector<BasicBlock *> Exiting;
+ TheLoop->getExitingBlocks(Exiting);
+ if (Exiting.size() != 2 || Exiting[0] != TheLoop->getHeader() ||
+ Exiting[1] != TheLoop->getLoopLatch() ||
+ any_of(*TheLoop->getHeader(), [](Instruction &I) {
+ return I.mayReadFromMemory() || I.mayHaveSideEffects();
+ }))
+ return false;
+ CmpInst::Predicate Pred;
+ Value *A, *B;
+ if (!match(
+ TheLoop->getHeader()->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Value(A), m_Value(B)), m_Value(), m_Value())) ||
+ Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE)
+ return false;
+ if (any_of(TheLoop->getBlocks(), [this](BasicBlock *BB) {
+ return any_of(*BB, [this](Instruction &I) {
+ return any_of(I.users(), [this](User *U) {
+ return !TheLoop->contains(cast<Instruction>(U)->getParent());
+ });
+ });
+ }))
+ return false;
+ return true;
+}
+
// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
bool UseVPlanNativePath) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 027ee21527d228..28b95d3b40562c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1362,9 +1362,11 @@ class LoopVectorizationCostModel {
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(
- dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
- return true;
+ if (!Legal->canVectorizeMultiCond()) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+ return true;
+ }
}
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2535,8 +2537,17 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
- assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
- "multiple exit loop without required epilogue?");
+ if (Legal->canVectorizeMultiCond()) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ BasicBlock *TrueSucc =
+ cast<BranchInst>(Latch->getTerminator())->getSuccessor(0);
+ BasicBlock *FalseSucc =
+ cast<BranchInst>(Latch->getTerminator())->getSuccessor(1);
+ LoopExitBlock = OrigLoop->contains(TrueSucc) ? FalseSucc : TrueSucc;
+ } else {
+ assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+ "multiple exit loop without required epilogue?");
+ }
LoopMiddleBlock =
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
@@ -2910,7 +2921,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
- if (Cost->requiresScalarEpilogue(VF.isVector())) {
+ if (Legal->canVectorizeMultiCond() ||
+ Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
@@ -3554,7 +3566,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
TheLoop->getExitingBlocks(Exiting);
for (BasicBlock *E : Exiting) {
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
+ (TheLoop->getLoopLatch() == E || !Legal->canVectorizeMultiCond()))
AddToWorklistIfAllowed(Cmp);
}
@@ -7643,12 +7656,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);
// 2.5 Collect reduction resume values.
- auto *ExitVPBB =
- cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
- for (VPRecipeBase &R : *ExitVPBB) {
- createAndCollectMergePhiForReduction(
- dyn_cast<VPInstruction>(&R), State, OrigLoop,
- State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+ VPBasicBlock *ExitVPBB = nullptr;
+ if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
+ ExitVPBB = cast<VPBasicBlock>(
+ BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+ for (VPRecipeBase &R : *ExitVPBB) {
+ createAndCollectMergePhiForReduction(
+ dyn_cast<VPInstruction>(&R), State, OrigLoop,
+ State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+ }
}
// 2.6. Maintain Loop Hints
@@ -7674,6 +7690,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
}
+
TargetTransformInfo::UnrollingPreferences UP;
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7686,15 +7703,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();
// 4. Adjust branch weight of the branch in the middle block.
- auto *MiddleTerm =
- cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
- if (MiddleTerm->isConditional() &&
- hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
- // Assume that `Count % VectorTripCount` is equally distributed.
- unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
- assert(TripCount > 0 && "trip count should not be zero");
- const uint32_t Weights[] = {1, TripCount - 1};
- setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ if (ExitVPBB) {
+ auto *MiddleTerm =
+ cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+ if (MiddleTerm->isConditional() &&
+ hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+ // Assume that `Count % VectorTripCount` is equally distributed.
+ unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+ assert(TripCount > 0 && "trip count should not be zero");
+ const uint32_t Weights[] = {1, TripCount - 1};
+ setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+ }
}
return State.ExpandedSCEVs;
@@ -8079,7 +8098,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
// adding uses of an otherwise potentially dead instruction.
- if (OrigLoop->isLoopExiting(Src))
+ if (!Legal->canVectorizeMultiCond() && OrigLoop->isLoopExiting(Src))
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8729,6 +8748,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+ if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
+ return {};
auto *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
// No edge from the middle block to the unique exit block has been inserted
@@ -8814,6 +8835,8 @@ static void addLiveOutsForFirstOrderRecurrences(
// TODO: Should be replaced by
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
// scalar region is modeled as well.
+ if (!VectorRegion->getSingleSuccessor())
+ return;
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9100,10 +9123,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
- addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
- addUsersInExitBlock(*Plan, ExitUsersToFix);
+ if (Legal->canVectorizeMultiCond()) {
+ VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
+ RecipeBuilder);
+ } else {
+ SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
+ OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+ addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+ addUsersInExitBlock(*Plan, ExitUsersToFix);
+ }
// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9231,8 +9259,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
using namespace VPlanPatternMatch;
VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
- VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
for (VPRecipeBase &R : Header->phis()) {
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
@@ -9251,8 +9277,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPUser *U : Cur->users()) {
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
- assert(UserRecipe->getParent() == MiddleVPBB &&
- "U must be either in the loop region or the middle block.");
continue;
}
Worklist.insert(UserRecipe);
@@ -9357,6 +9381,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
Builder.setInsertPoint(&*LatchVPBB->begin());
+ if (!VectorLoopRegion->getSingleSuccessor())
+ return;
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
for (VPRecipeBase &R :
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5a8469f800cada..51ea14f14dbb7a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
// backedges. A backward successor is set when the branch is created.
const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ if (TermBr->getSuccessor(idx) &&
+ PredVPBlock == getPlan()->getVectorLoopRegion() &&
+ PredVPBlock->getNumSuccessors()) {
+      // Update PredBB and TermBr for BranchOnMultiCond in predecessor.
+ PredBB = TermBr->getSuccessor(1);
+ TermBr = cast<BranchInst>(PredBB->getTerminator());
+ idx = 0;
+ }
assert(!TermBr->getSuccessor(idx) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, IRBB);
@@ -908,8 +916,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
- VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
if (!RequiresScalarEpilogueCheck) {
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
}
@@ -923,10 +931,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
// we unconditionally branch to the scalar preheader. Do nothing.
// 3) Otherwise, construct a runtime check.
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
- auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
- // The connection order corresponds to the operands of the conditional branch.
- VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
- VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ if (IRExitBlock) {
+ auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+ // The connection order corresponds to the operands of the conditional
+ // branch.
+ VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ }
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
// Here we use the same DebugLoc as the scalar loop latch terminator instead
@@ -1035,7 +1047,9 @@ void VPlan::execute(VPTransformState *State) {
// VPlan execution rather than earlier during VPlan construction.
BasicBlock *MiddleBB = State->CFG.ExitBB;
VPBasicBlock *MiddleVPBB =
- cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+ getVectorLoopRegion()->getNumSuccessors() == 1
+ ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
+ : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
// Find the VPBB for the scalar preheader, relying on the current structure
// when creating the middle block and its successors: if there's a single
// predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1048,6 +1062,10 @@ void VPlan::execute(VPTransformState *State) {
MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]);
assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
"scalar preheader cannot be wrapped already");
+ if (ScalarPhVPBB->getNumSuccessors() != 0) {
+ ScalarPhVPBB = cast<VPBasicBlock>(ScalarPhVPBB->getSuccessors()[1]);
+ MiddleVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+ }
replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
@@ -1069,6 +1087,10 @@ void VPlan::execute(VPTransformState *State) {
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
+ if (!getVectorLoopRegion()->getSingleSuccessor())
+ VectorLatchBB =
+ cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
+
// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
@@ -1095,7 +1117,10 @@ void VPlan::execute(VPTransformState *State) {
// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
- Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+ if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI())
+ Inc->moveBefore(VectorLatchBB->getTerminator());
+ else
+ Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
// Use the steps for the last part as backedge value for the induction.
if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 68a62638b9d588..def7b1eba55824 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1249,6 +1249,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
+ AnyOf,
};
private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 68397d9f8bf5c3..6153e617a0ff52 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
default:
return true;
}
+ case VPExpandSCEVSC:
+ return getParent()->getPlan()->getTripCount() == getVPSingleValue();
case VPInterleaveSC:
return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
@@ -159,6 +161,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPPredInstPHISC:
case VPScalarCastSC:
return false;
+ case VPExpandSCEVSC:
+ return getParent()->getPlan()->getTripCount() == getVPSingleValue();
case VPInstructionSC:
return mayWriteToMemory();
case VPWidenCallSC: {
@@ -386,12 +390,14 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
return true;
switch (Opcode) {
case Instruction::ICmp:
+ case Instruction::Select:
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::AnyOf:
return true;
default:
return false;
@@ -434,9 +440,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreateCmp(getPredicate(), A, B, Name);
}
case Instruction::Select: {
- Value *Cond = State.get(getOperand(0));
- Value *Op1 = State.get(getOperand(1));
- Value *Op2 = State.get(getOperand(2));
+ bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+ Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
+ Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
+ Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
}
case VPInstruction::ActiveLaneMask: {
@@ -666,6 +673,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
}
return NewPhi;
}
+ case VPInstruction::AnyOf: {
+ Value *A = State.get(getOperand(0));
+ return Builder.CreateOrReduce(A);
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
@@ -674,7 +685,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ComputeReductionResult;
+ getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::AnyOf;
}
bool VPInstruction::isSingleScalar() const {
@@ -736,6 +748,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
default:
return false;
case Instruction::ICmp:
+ case Instruction::Select:
+ case Instruction::Or:
case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
@@ -831,6 +845,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
+ case VPInstruction::AnyOf:
+ O << "any-of";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 379bfc0a4394bf..45bf40aaecc083 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -515,6 +515,12 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Plan.getEntry());
+ for (VPRecipeBase &R : make_early_inc_range(
+ reverse(*cast<VPBasicBlock>(Plan.getPreheader())))) {
+ if (isDeadRecipe(R))
+ R.eraseFromParent();
+ }
+
for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
// The recipes in the block are processed in reverse order, to catch chains
// of dead recipes.
@@ -1664,3 +1670,79 @@ void VPlanTransforms::createInterleaveGroups(
}
}
}
+
+void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ VPRecipeBuilder &RecipeBuilder) {
+ auto *LatchVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+ VPBuilder Builder(LatchVPBB->getTerminator());
+ auto *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+
+ const SCEV *BackedgeTakenCount =
+ SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+ const SCEV *TripCount = SE.getTripCountFromExitCount(
+ BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop);
+ VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE);
+ Plan.getTripCount()->replaceAllUsesWith(NewTC);
+ Plan.resetTripCount(NewTC);
+
+ VPValue *EarlyExitTaken = nullptr;
+ SmallVector<BasicBlock *> ExitingBBs;
+ OrigLoop->getExitingBlocks(ExitingBBs);
+ for (BasicBlock *Exiting : ExitingBBs) {
+ auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
+ BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
+ BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
+ VPIRBasicBlock *VPExitBlock;
+ if (OrigLoop->getUniqueExitBlock())
+ VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+ else
+ VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+ !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+
+ for (VPRecipeBase &R : *VPExitBlock) {
+ auto *ExitIRI = cast<VPIRInstruction>(&R);
+ auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
+ if (!ExitPhi)
+ break;
+ Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting);
+ VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
+ ExitIRI->addOperand(V);
+ }
+
+ if (Exiting == OrigLoop->getLoopLatch()) {
+ if (MiddleVPBB->getNumSuccessors() == 0) {
+ VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
+ VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ }
+ continue;
+ }
+
+ VPValue *M = RecipeBuilder.getBlockInMask(
+ OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+ auto *N = Builder.createNot(M);
+ EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N});
+
+ VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+ VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB);
+ VPBlockUtils::insertBlockAfter(NewMiddle, LoopRegion);
+ VPBlockUtils::connectBlocks(NewMiddle, VPExitBlock);
+ VPBlockUtils::connectBlocks(NewMiddle, MiddleVPBB);
+
+ VPBuilder MiddleBuilder(NewMiddle);
+ MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
+    // TODO: Set MiddleVPBB = NewMiddle here once multiple early exits are supported.
+ }
+  auto *Term = cast<VPInstruction>(LatchVPBB->getTerminator());
+ auto *IsLatchExiting = Builder.createICmp(
+ CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1));
+ auto *AnyExiting =
+ Builder.createNaryOp(Instruction::Or, {EarlyExitTaken, IsLatchExiting});
+ Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExiting);
+ Term->eraseFromParent();
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3b792ee32dce6e..dd043ae1706517 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -121,6 +121,10 @@ struct VPlanTransforms {
/// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan);
+
+ static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+ Loop *OrigLoop,
+ VPRecipeBuilder &RecipeBuilder);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 99bc4c38a3c3cd..d3694d1a2fc129 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -244,14 +244,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
return false;
}
- VPBlockBase *MiddleBB =
- IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor();
- if (IRBB != IRBB->getPlan()->getPreheader() &&
- IRBB->getSinglePredecessor() != MiddleBB) {
- errs() << "VPIRBasicBlock can only be used as pre-header or a successor of "
- "middle-block at the moment!\n";
- return false;
- }
return true;
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
new file mode 100644
index 00000000000000..bdc8bb07df457f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization %s | FileCheck --check-prefix=MULTI %s
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s
+
+define i64 @multi_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; MULTI-NEXT: [[ENTRY:.*]]:
+; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; MULTI: [[VECTOR_PH]]:
+; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; MULTI-NEXT: br label %[[VECTOR_BODY:.*]]
+; MULTI: [[VECTOR_BODY]]:
+; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; MULTI-NEXT: [[TMP2:%.*]] = xor <8 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
+; MULTI-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; MULTI-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP8]], i32 4, <8 x i1> [[TMP2]])
+; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; MULTI-NEXT: [[TMP5:%.*]] = xor <8 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]])
+; MULTI-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; MULTI-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; MULTI-NEXT: [[TMP9:%.*]] = or i1 [[TMP6]], [[TMP15]]
+; MULTI-NEXT: br i1 [[TMP9]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; MULTI: [[MIDDLE_SPLIT]]:
+; MULTI-NEXT: br i1 [[TMP6]], label %[[E1:.*]], label %[[MIDDLE_BLOCK:.*]]
+; MULTI: [[MIDDLE_BLOCK]]:
+; MULTI-NEXT: br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]]
+; MULTI: [[SCALAR_PH]]:
+; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MULTI-NEXT: br label %[[LOOP_HEADER:.*]]
+; MULTI: [[LOOP_HEADER]]:
+; MULTI-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; MULTI-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; MULTI-NEXT: br i1 [[CMP1]], label %[[E1]], label %[[LOOP_LATCH]]
+; MULTI: [[LOOP_LATCH]]:
+; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; MULTI-NEXT: [[INC]] = add nuw i64 [[I_07]], 1
+; MULTI-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; MULTI-NEXT: br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; MULTI: [[E1]]:
+; MULTI-NEXT: ret i64 0
+; MULTI: [[E2]]:
+; MULTI-NEXT: ret i64 1
+;
+; DEFAULT-LABEL: define i64 @multi_exit_with_store(
+; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT: [[ENTRY:.*]]:
+; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
+; DEFAULT-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 8
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT: [[VECTOR_PH]]:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 8
+; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP5]], i64 8, i64 [[N_MOD_VF]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]]
+; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
+; DEFAULT: [[VECTOR_BODY]]:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; DEFAULT-NEXT: store <8 x i32> zeroinitializer, ptr [[TMP3]], align 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT: [[MIDDLE_BLOCK]]:
+; DEFAULT-NEXT: br label %[[SCALAR_PH]]
+; DEFAULT: [[SCALAR_PH]]:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]]
+; DEFAULT: [[LOOP_HEADER]]:
+; DEFAULT-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; DEFAULT-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; DEFAULT-NEXT: br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]]
+; DEFAULT: [[LOOP_LATCH]]:
+; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT: [[INC]] = add nuw i64 [[I_07]], 1
+; DEFAULT-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; DEFAULT-NEXT: br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: [[E1]]:
+; DEFAULT-NEXT: ret i64 0
+; DEFAULT: [[E2]]:
+; DEFAULT-NEXT: ret i64 1
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header
+
+e1:
+ ret i64 0
+
+e2:
+ ret i64 1
+
+}
+;.
+; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; MULTI: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; MULTI: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MULTI: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index cd128979fc1431..1c02f10753745c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside(
; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
-; CHECK-NEXT: [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]])
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[N]], -1
; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]])
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP32:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT: [[TMP33:%.*]] = freeze i64 [[TMP32]]
+; CHECK-NEXT: [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]])
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1
; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
new file mode 100644
index 00000000000000..151c46123f6f0d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization -debug %s 2>&1 | FileCheck %s
+
+define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={2,4,8},UF={1}' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%7>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<%0>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<%5> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<%4>
+; CHECK-NEXT: vp<%6> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<%6>, ir<0>, vp<%5>
+; CHECK-NEXT: EMIT vp<%7> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT: EMIT vp<%8> = not vp<%5>
+; CHECK-NEXT: EMIT vp<%9> = any-of vp<%8>
+; CHECK-NEXT: EMIT vp<%10> = icmp eq vp<%7>, vp<%2>
+; CHECK-NEXT: EMIT vp<%11> = or vp<%9>, vp<%10>
+; CHECK-NEXT: EMIT branch-on-cond vp<%11>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<%9>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%14> = icmp eq ir<128>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%14>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e2, label %loop.header
+
+e1:
+ %p1 = phi i64 [ 0, %loop.header ]
+ ret i64 %p1
+
+e2:
+ %p2 = phi i64 [ 1, %loop.latch ]
+ ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={2,4,8},UF={1}' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%7>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<%0>
+; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT: EMIT vp<%5> = not ir<%c.1>
+; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<%4>
+; CHECK-NEXT: vp<%6> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN store vp<%6>, ir<0>, vp<%5>
+; CHECK-NEXT: EMIT vp<%7> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT: EMIT vp<%8> = not vp<%5>
+; CHECK-NEXT: EMIT vp<%9> = any-of vp<%8>
+; CHECK-NEXT: EMIT vp<%10> = icmp eq vp<%7>, vp<%2>
+; CHECK-NEXT: EMIT vp<%11> = or vp<%9>, vp<%10>
+; CHECK-NEXT: EMIT branch-on-cond vp<%11>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT: EMIT branch-on-cond vp<%9>
+; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e>:
+; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%14> = icmp eq ir<128>, vp<%2>
+; CHECK-NEXT: EMIT branch-on-cond vp<%14>
+; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+ %c.1 = icmp uge i64 %iv, %N
+ br i1 %c.1, label %e, label %loop.latch
+
+loop.latch:
+ %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+ store i32 0, ptr %arrayidx
+ %inc = add nuw i64 %iv, 1
+ %c.2 = icmp eq i64 %inc, 128
+ br i1 %c.2, label %e, label %loop.header
+
+e:
+ %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+ ret i64 %p1
+}