[llvm] [VPlan] Introduce multi-branch recipe, use for multi-exit loops (WIP). (PR #109193)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 13 06:54:39 PDT 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/109193

>From 79d5c6aa3d013ec31d193abc8c95c557baba78ee Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 1 Oct 2024 20:57:24 +0100
Subject: [PATCH 1/2] [VPlan] Support VPIRBBs and VPIRInst phis with multiple
 predecessors.

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp        | 5 ++++-
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5e3a6388094940..5a8469f800cada 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1060,7 +1060,10 @@ void VPlan::execute(VPTransformState *State) {
   State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
 
   // Generate code in the loop pre-header and body.
-  for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Entry);
+
+  for (VPBlockBase *Block : RPOT)
     Block->execute(State);
 
   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2948ecc580edc0..68397d9f8bf5c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -848,12 +848,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 void VPIRInstruction::execute(VPTransformState &State) {
   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
          "Only PHINodes can have extra operands");
-  if (getNumOperands() == 1) {
-    VPValue *ExitValue = getOperand(0);
+  for (const auto &[Idx, Op] : enumerate(operands())) {
+    VPValue *ExitValue = Op;
     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                     ? VPLane::getFirstLane()
                     : VPLane::getLastLaneForVF(State.VF);
-    auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
+    auto *PredVPBB = Pred->getExitingBasicBlock();
     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
     // Set insertion point in PredBB in case an extract needs to be generated.
     // TODO: Model extracts explicitly.
@@ -881,7 +882,7 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
   O << Indent << "IR " << I;
 
   if (getNumOperands() != 0) {
-    assert(getNumOperands() == 1 && "can have at most 1 operand");
+    // assert(getNumOperands() == 1 && "can have at most 1 operand");
     O << " (extra operand: ";
     printOperands(O, SlotTracker);
     O << ")";

>From 6f31e57917aab09e891554614e42d6f0132a3fa3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 18 Sep 2024 21:35:57 +0100
Subject: [PATCH 2/2] [VPlan] Introduce multi-branch recipe, use for multi-exit
 loops (WIP).

This patch introduces a new BranchMultipleConds VPInstruction that
takes multiple conditions and branches to the first successor if the
first operand is true, to the second successor if the second condition
is true and to the region header if neither is true. At the moment it
only supports 2 conditions, but it can be extended in the future.

This may serve as an alternative to changing VPRegionBlock to allow
multiple exiting blocks and keep it single-entry-single-exit. With
BranchMultipleConds, we still leave a region via a single exiting
block, but can have more than 2 destinations (similar idea to switch in
LLVM IR). The new recipe allows precise modeling of the edges and
conditions leaving the vector loop region.

BranchMultipleConds also allows predicating instructions in blocks
after any early exit, i.e. also allows later stores.

See llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll for
an example VPlan and llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
for example predicated codegen.

The patch also contains logic to construct VPlans using
BranchMultipleConds for simple loops with 2 exit blocks instead of
requiring a scalar tail. The logic to detect such cases is a bit rough
around the edges and exists mainly to test the new recipes end-to-end.

This may serve as an alternative to https://github.com/llvm/llvm-project/pull/108563
that would allow us to keep the single-entry-single-exit property and
support predication between early exits and latches.
---
 .../Vectorize/LoopVectorizationLegality.h     |   2 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  34 +++++
 .../Transforms/Vectorize/LoopVectorize.cpp    |  82 +++++++----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  43 ++++--
 llvm/lib/Transforms/Vectorize/VPlan.h         |   3 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  58 +++++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  74 ++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |   4 +
 .../Transforms/Vectorize/VPlanVerifier.cpp    |  20 +--
 .../LoopVectorize/X86/multi-exit-codegen.ll   | 124 ++++++++++++++++
 .../LoopVectorize/X86/multi-exit-cost.ll      |  18 +--
 .../LoopVectorize/X86/multi-exit-vplan.ll     | 134 ++++++++++++++++++
 12 files changed, 533 insertions(+), 63 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index dc7e484a40a452..19fa7b8ae9d6fe 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,6 +287,8 @@ class LoopVectorizationLegality {
   /// we can use in-order reductions.
   bool canVectorizeFPMath(bool EnableStrictReductions);
 
+  bool canVectorizeMultiCond() const;
+
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking.
   bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..4e76e11d80a723 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,6 +43,9 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                        cl::desc("Enable recognition of non-constant strided "
                                 "pointer induction variables."));
 
+static cl::opt<bool> EnableMultiCond("enable-multi-cond-vectorization",
+                                     cl::init(false), cl::Hidden, cl::desc(""));
+
 namespace llvm {
 cl::opt<bool>
     HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1378,6 +1381,8 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
 }
 
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+  if (canVectorizeMultiCond() && BB != TheLoop->getHeader())
+    return true;
   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 }
 
@@ -1514,6 +1519,35 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   return true;
 }
 
+bool LoopVectorizationLegality::canVectorizeMultiCond() const {
+  if (!EnableMultiCond)
+    return false;
+  SmallVector<BasicBlock *> Exiting;
+  TheLoop->getExitingBlocks(Exiting);
+  if (Exiting.size() != 2 || Exiting[0] != TheLoop->getHeader() ||
+      Exiting[1] != TheLoop->getLoopLatch() ||
+      any_of(*TheLoop->getHeader(), [](Instruction &I) {
+        return I.mayReadFromMemory() || I.mayHaveSideEffects();
+      }))
+    return false;
+  CmpInst::Predicate Pred;
+  Value *A, *B;
+  if (!match(
+          TheLoop->getHeader()->getTerminator(),
+          m_Br(m_ICmp(Pred, m_Value(A), m_Value(B)), m_Value(), m_Value())) ||
+      Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE)
+    return false;
+  if (any_of(TheLoop->getBlocks(), [this](BasicBlock *BB) {
+        return any_of(*BB, [this](Instruction &I) {
+          return any_of(I.users(), [this](User *U) {
+            return !TheLoop->contains(cast<Instruction>(U)->getParent());
+          });
+        });
+      }))
+    return false;
+  return true;
+}
+
 // Helper function to canVectorizeLoopNestCFG.
 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                     bool UseVPlanNativePath) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 027ee21527d228..7aff74194b703e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1362,9 +1362,11 @@ class LoopVectorizationCostModel {
     // If we might exit from anywhere but the latch, must run the exiting
     // iteration in scalar form.
     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-      LLVM_DEBUG(
-          dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
-      return true;
+      if (!Legal->canVectorizeMultiCond()) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+        return true;
+      }
     }
     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2535,8 +2537,17 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
-  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
-         "multiple exit loop without required epilogue?");
+  if (Legal->canVectorizeMultiCond()) {
+    BasicBlock *Latch = OrigLoop->getLoopLatch();
+    BasicBlock *TrueSucc =
+        cast<BranchInst>(Latch->getTerminator())->getSuccessor(0);
+    BasicBlock *FalseSucc =
+        cast<BranchInst>(Latch->getTerminator())->getSuccessor(1);
+    LoopExitBlock = OrigLoop->contains(TrueSucc) ? FalseSucc : TrueSucc;
+  } else {
+    assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+           "multiple exit loop without required epilogue?");
+  }
 
   LoopMiddleBlock =
       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
@@ -2910,7 +2921,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
     for (PHINode &PN : Exit->phis())
       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
 
-  if (Cost->requiresScalarEpilogue(VF.isVector())) {
+  if (Legal->canVectorizeMultiCond() ||
+      Cost->requiresScalarEpilogue(VF.isVector())) {
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
     // from scalar loop only.
@@ -3554,7 +3566,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   TheLoop->getExitingBlocks(Exiting);
   for (BasicBlock *E : Exiting) {
     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
-    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
+        (TheLoop->getLoopLatch() == E || !Legal->canVectorizeMultiCond()))
       AddToWorklistIfAllowed(Cmp);
   }
 
@@ -7643,12 +7656,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.execute(&State);
 
   // 2.5 Collect reduction resume values.
-  auto *ExitVPBB =
-      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  for (VPRecipeBase &R : *ExitVPBB) {
-    createAndCollectMergePhiForReduction(
-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+  VPBasicBlock *ExitVPBB = nullptr;
+  if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
+    ExitVPBB = cast<VPBasicBlock>(
+        BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+    for (VPRecipeBase &R : *ExitVPBB) {
+      createAndCollectMergePhiForReduction(
+          dyn_cast<VPInstruction>(&R), State, OrigLoop,
+          State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+    }
   }
 
   // 2.6. Maintain Loop Hints
@@ -7674,6 +7690,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     LoopVectorizeHints Hints(L, true, *ORE);
     Hints.setAlreadyVectorized();
   }
+
   TargetTransformInfo::UnrollingPreferences UP;
   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7686,15 +7703,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.printDebugTracesAtEnd();
 
   // 4. Adjust branch weight of the branch in the middle block.
-  auto *MiddleTerm =
-      cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
-  if (MiddleTerm->isConditional() &&
-      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-    // Assume that `Count % VectorTripCount` is equally distributed.
-    unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-    assert(TripCount > 0 && "trip count should not be zero");
-    const uint32_t Weights[] = {1, TripCount - 1};
-    setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+  if (ExitVPBB) {
+    auto *MiddleTerm =
+        cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+    if (MiddleTerm->isConditional() &&
+        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      const uint32_t Weights[] = {1, TripCount - 1};
+      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+    }
   }
 
   return State.ExpandedSCEVs;
@@ -8079,7 +8098,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   // If source is an exiting block, we know the exit edge is dynamically dead
   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
   // adding uses of an otherwise potentially dead instruction.
-  if (OrigLoop->isLoopExiting(Src))
+  if (!Legal->canVectorizeMultiCond() && OrigLoop->isLoopExiting(Src))
     return EdgeMaskCache[Edge] = SrcMask;
 
   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8729,6 +8748,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+  if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
+    return {};
   auto *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   // No edge from the middle block to the unique exit block has been inserted
@@ -8814,6 +8835,8 @@ static void addLiveOutsForFirstOrderRecurrences(
   // TODO: Should be replaced by
   // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
   // scalar region is modeled as well.
+  if (!VectorRegion->getSingleSuccessor())
+    return;
   auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
   VPBasicBlock *ScalarPHVPBB = nullptr;
   if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9100,6 +9123,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
+  if (Legal->canVectorizeMultiCond()) {
+    VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
+                                        RecipeBuilder);
+  }
+
   SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
       OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
   addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
@@ -9231,8 +9259,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   using namespace VPlanPatternMatch;
   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   for (VPRecipeBase &R : Header->phis()) {
     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
@@ -9251,8 +9277,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       for (VPUser *U : Cur->users()) {
         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
-          assert(UserRecipe->getParent() == MiddleVPBB &&
-                 "U must be either in the loop region or the middle block.");
           continue;
         }
         Worklist.insert(UserRecipe);
@@ -9357,6 +9381,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   }
   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
   Builder.setInsertPoint(&*LatchVPBB->begin());
+  if (!VectorLoopRegion->getSingleSuccessor())
+    return;
+  VPBasicBlock *MiddleVPBB =
+      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
   for (VPRecipeBase &R :
        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5a8469f800cada..cd6a326ed70c76 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
     // backedges. A backward successor is set when the branch is created.
     const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
     unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+    if (TermBr->getSuccessor(idx) &&
+        PredVPBlock == getPlan()->getVectorLoopRegion() &&
+        PredVPBlock->getNumSuccessors()) {
+      // Update PredBB and TermBr for BranchOnMultiCond in predecessor.
+      PredBB = TermBr->getSuccessor(1);
+      TermBr = cast<BranchInst>(PredBB->getTerminator());
+      idx = 0;
+    }
     assert(!TermBr->getSuccessor(idx) &&
            "Trying to reset an existing successor block.");
     TermBr->setSuccessor(idx, IRBB);
@@ -601,9 +609,11 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
   }
 
   const VPRecipeBase *R = &VPBB->back();
-  bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
-                      match(R, m_BranchOnCond(m_VPValue())) ||
-                      match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
+  bool IsCondBranch =
+      isa<VPBranchOnMaskRecipe>(R) || match(R, m_BranchOnCond(m_VPValue())) ||
+      match(R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+      (isa<VPInstruction>(R) && cast<VPInstruction>(R)->getOpcode() ==
+                                    VPInstruction::BranchMultipleConds);
   (void)IsCondBranch;
 
   if (VPBB->getNumSuccessors() >= 2 ||
@@ -908,8 +918,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
 
-  VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
   if (!RequiresScalarEpilogueCheck) {
+    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
     return Plan;
   }
@@ -923,10 +933,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
-  auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
-  // The connection order corresponds to the operands of the conditional branch.
-  VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
-  VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  if (IRExitBlock) {
+    auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+    // The connection order corresponds to the operands of the conditional
+    // branch.
+    VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+    VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  }
 
   auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
   // Here we use the same DebugLoc as the scalar loop latch terminator instead
@@ -1035,7 +1049,9 @@ void VPlan::execute(VPTransformState *State) {
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
   VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+      getVectorLoopRegion()->getNumSuccessors() == 1
+          ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
+          : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
   // Find the VPBB for the scalar preheader, relying on the current structure
   // when creating the middle block and its successrs: if there's a single
   // predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1069,6 +1085,10 @@ void VPlan::execute(VPTransformState *State) {
   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
+  if (!getVectorLoopRegion()->getSingleSuccessor())
+    VectorLatchBB =
+        cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
+
   // Fix the latch value of canonical, reduction and first-order recurrences
   // phis in the vector loop.
   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
@@ -1095,7 +1115,10 @@ void VPlan::execute(VPTransformState *State) {
       // Move the last step to the end of the latch block. This ensures
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+      if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI())
+        Inc->moveBefore(VectorLatchBB->getTerminator());
+      else
+        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
 
       // Use the steps for the last part as backedge value for the induction.
       if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 68a62638b9d588..08c6a868e9bfcc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1239,6 +1239,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     CanonicalIVIncrementForPart,
     BranchOnCount,
     BranchOnCond,
+    BranchMultipleConds,
     ComputeReductionResult,
     // Takes the VPValue to extract from as first operand and the lane or part
     // to extract as second operand, counting from the end starting with 1 for
@@ -1249,6 +1250,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
+    AnyOf,
   };
 
 private:
@@ -1370,6 +1372,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     case Instruction::AtomicRMW:
     case VPInstruction::BranchOnCond:
     case VPInstruction::BranchOnCount:
+    case VPInstruction::BranchMultipleConds:
       return false;
     default:
       return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 68397d9f8bf5c3..193d735b676b41 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
     default:
       return true;
     }
+  case VPExpandSCEVSC:
+    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -159,6 +161,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPPredInstPHISC:
   case VPScalarCastSC:
     return false;
+  case VPExpandSCEVSC:
+    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInstructionSC:
     return mayWriteToMemory();
   case VPWidenCallSC: {
@@ -386,12 +390,14 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
     return true;
   switch (Opcode) {
   case Instruction::ICmp:
+  case Instruction::Select:
   case VPInstruction::BranchOnCond:
   case VPInstruction::BranchOnCount:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::AnyOf:
     return true;
   default:
     return false;
@@ -434,9 +440,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return Builder.CreateCmp(getPredicate(), A, B, Name);
   }
   case Instruction::Select: {
-    Value *Cond = State.get(getOperand(0));
-    Value *Op1 = State.get(getOperand(1));
-    Value *Op2 = State.get(getOperand(2));
+    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+    Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
+    Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
+    Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
     return Builder.CreateSelect(Cond, Op1, Op2, Name);
   }
   case VPInstruction::ActiveLaneMask: {
@@ -529,6 +536,34 @@ Value *VPInstruction::generate(VPTransformState &State) {
     CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
     return CondBr;
   }
+  case VPInstruction::BranchMultipleConds: {
+    assert(getNumOperands() == 2 && "Must have exactly 2 conditions");
+    assert(getParent()->isExiting() && "Must be placed in exiting block");
+    assert(getParent()->getParent()->getNumSuccessors() == 2 &&
+           "Must have exactly 2 successors");
+
+    VPRegionBlock *ParentRegion = getParent()->getParent();
+    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
+    Value *Cond1 = State.get(getOperand(0), /*IsScalar*/ true);
+    Value *Cond2 = State.get(getOperand(1), /*IsScalar*/ true);
+    BasicBlock *BB = Builder.GetInsertBlock();
+    BasicBlock *BB2 =
+        BB->splitBasicBlock(BB->getTerminator(), BB->getName() + ".split");
+
+    Builder.SetInsertPoint(BB->getTerminator());
+    BranchInst *CondBr1 = Builder.CreateCondBr(Cond1, BB, BB2);
+
+    Builder.SetInsertPoint(BB2->getTerminator());
+    BranchInst *CondBr2 = Builder.CreateCondBr(Cond2, BB2, nullptr);
+    CondBr2->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
+    CondBr1->setSuccessor(0, nullptr);
+    CondBr2->setSuccessor(0, nullptr);
+    BB->getTerminator()->eraseFromParent();
+    BB2->getTerminator()->eraseFromParent();
+    State.CFG.PrevBB = BB2;
+    return CondBr2;
+  }
+
   case VPInstruction::BranchOnCount: {
     // First create the compare.
     Value *IV = State.get(getOperand(0), /*IsScalar*/ true);
@@ -666,6 +701,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     }
     return NewPhi;
   }
+  case VPInstruction::AnyOf: {
+    Value *A = State.get(getOperand(0));
+    return Builder.CreateOrReduce(A);
+  }
 
   default:
     llvm_unreachable("Unsupported opcode for instruction");
@@ -674,7 +713,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == VPInstruction::AnyOf;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -736,6 +776,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   default:
     return false;
   case Instruction::ICmp:
+  case Instruction::Select:
+  case Instruction::Or:
   case VPInstruction::PtrAdd:
     // TODO: Cover additional opcodes.
     return vputils::onlyFirstLaneUsed(this);
@@ -746,6 +788,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   case VPInstruction::BranchOnCount:
   case VPInstruction::BranchOnCond:
   case VPInstruction::ResumePhi:
+  case VPInstruction::BranchMultipleConds:
     return true;
   };
   llvm_unreachable("switch should return");
@@ -764,6 +807,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
     return vputils::onlyFirstPartUsed(this);
   case VPInstruction::BranchOnCount:
   case VPInstruction::BranchOnCond:
+  case VPInstruction::BranchMultipleConds:
   case VPInstruction::CanonicalIVIncrementForPart:
     return true;
   };
@@ -810,6 +854,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BranchOnCond:
     O << "branch-on-cond";
     break;
+  case VPInstruction::BranchMultipleConds:
+    O << "branch-on-multi-cond";
+    break;
   case VPInstruction::CalculateTripCountMinusVF:
     O << "TC > VF ? TC - VF : 0";
     break;
@@ -831,6 +878,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
+  case VPInstruction::AnyOf:
+    O << "any-of";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 379bfc0a4394bf..145b8c418a37be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -515,6 +515,12 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       Plan.getEntry());
 
+  for (VPRecipeBase &R : make_early_inc_range(
+           reverse(*cast<VPBasicBlock>(Plan.getPreheader())))) {
+    if (isDeadRecipe(R))
+      R.eraseFromParent();
+  }
+
   for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
     // The recipes in the block are processed in reverse order, to catch chains
     // of dead recipes.
@@ -1664,3 +1670,71 @@ void VPlanTransforms::createInterleaveGroups(
       }
   }
 }
+
+void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+                                         Loop *OrigLoop,
+                                         VPRecipeBuilder &RecipeBuilder) {
+  auto *LatchVPBB =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+  VPBuilder Builder(LatchVPBB->getTerminator());
+  auto *MiddleVPBB =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB);
+
+  const SCEV *BackedgeTakenCount =
+      SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+  const SCEV *TripCount = SE.getTripCountFromExitCount(
+      BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop);
+  VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE);
+  Plan.getTripCount()->replaceAllUsesWith(NewTC);
+  Plan.resetTripCount(NewTC);
+
+  VPValue *EarlyExitTaken = nullptr;
+  SmallVector<BasicBlock *> ExitingBBs;
+  OrigLoop->getExitingBlocks(ExitingBBs);
+  for (BasicBlock *Exiting : ExitingBBs) {
+    auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
+    BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
+    BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
+    VPIRBasicBlock *VPExitBlock;
+    if (OrigLoop->getUniqueExitBlock())
+      VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+    else
+      VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+          !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+
+    for (VPRecipeBase &R : *VPExitBlock) {
+      auto *ExitIRI = cast<VPIRInstruction>(&R);
+      auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
+      if (!ExitPhi)
+        break;
+      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting);
+      VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
+      ExitIRI->addOperand(V);
+    }
+
+    if (Exiting == OrigLoop->getLoopLatch()) {
+      if (MiddleVPBB->getNumSuccessors() == 0) {
+        VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+        VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
+        VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+      }
+      continue;
+    }
+
+    VPValue *M = RecipeBuilder.getBlockInMask(
+        OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+    auto *N = Builder.createNot(M);
+    EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N});
+    VPBlockUtils::connectBlocks(LoopRegion, VPExitBlock);
+  }
+  auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
+  auto *IsLatchExiting = Builder.createICmp(
+      CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1));
+  Builder.createNaryOp(VPInstruction::BranchMultipleConds,
+                       {EarlyExitTaken, IsLatchExiting});
+  Term->eraseFromParent();
+  VPBlockUtils::connectBlocks(LoopRegion, MiddleVPBB);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3b792ee32dce6e..dd043ae1706517 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -121,6 +121,10 @@ struct VPlanTransforms {
 
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
+
+  static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+                                 Loop *OrigLoop,
+                                 VPRecipeBuilder &RecipeBuilder);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 99bc4c38a3c3cd..da8d0f26ba40a4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -244,14 +244,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
     return false;
   }
 
-  VPBlockBase *MiddleBB =
-      IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor();
-  if (IRBB != IRBB->getPlan()->getPreheader() &&
-      IRBB->getSinglePredecessor() != MiddleBB) {
-    errs() << "VPIRBasicBlock can only be used as pre-header or a successor of "
-              "middle-block at the moment!\n";
-    return false;
-  }
   return true;
 }
 
@@ -269,9 +261,9 @@ static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
 bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) {
   auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
   // Check block's condition bit.
-  if (VPB->getNumSuccessors() > 1 ||
-      (VPBB && VPBB->getParent() && VPBB->isExiting() &&
-       !VPBB->getParent()->isReplicator())) {
+  if (VPBB && (VPB->getNumSuccessors() > 1 ||
+               (VPBB && VPBB->getParent() && VPBB->isExiting() &&
+                !VPBB->getParent()->isReplicator()))) {
     if (!VPBB || !VPBB->getTerminator()) {
       errs() << "Block has multiple successors but doesn't "
                 "have a proper branch recipe!\n";
@@ -409,8 +401,10 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
   }
 
   auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end()));
-  if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
-                    LastInst->getOpcode() != VPInstruction::BranchOnCond)) {
+  if (!LastInst ||
+      (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
+       LastInst->getOpcode() != VPInstruction::BranchOnCond &&
+       LastInst->getOpcode() != VPInstruction::BranchMultipleConds)) {
-    errs() << "VPlan vector loop exit must end with BranchOnCount or "
-              "BranchOnCond VPInstruction\n";
+    errs() << "VPlan vector loop exit must end with BranchOnCount, "
+              "BranchOnCond or BranchMultipleConds VPInstruction\n";
     return false;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
new file mode 100644
index 00000000000000..e93bfda04b11a0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization %s | FileCheck --check-prefix=MULTI %s
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s
+
+define i64 @multi_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; MULTI-NEXT:  [[ENTRY:.*]]:
+; MULTI-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; MULTI:       [[VECTOR_PH]]:
+; MULTI-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; MULTI-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; MULTI-NEXT:    br label %[[VECTOR_BODY:.*]]
+; MULTI:       [[VECTOR_BODY]]:
+; MULTI-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_SPLIT:.*]] ]
+; MULTI-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY_SPLIT]] ]
+; MULTI-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MULTI-NEXT:    [[TMP1:%.*]] = icmp uge <8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; MULTI-NEXT:    [[TMP2:%.*]] = xor <8 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
+; MULTI-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; MULTI-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP8]], i32 4, <8 x i1> [[TMP2]])
+; MULTI-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; MULTI-NEXT:    [[TMP5:%.*]] = xor <8 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]])
+; MULTI-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; MULTI-NEXT:    br i1 [[TMP6]], label %[[E1:.*]], label %[[VECTOR_BODY_SPLIT]]
+; MULTI:       [[VECTOR_BODY_SPLIT]]:
+; MULTI-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; MULTI-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; MULTI:       [[MIDDLE_BLOCK]]:
+; MULTI-NEXT:    br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]]
+; MULTI:       [[SCALAR_PH]]:
+; MULTI-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MULTI-NEXT:    br label %[[LOOP_HEADER:.*]]
+; MULTI:       [[LOOP_HEADER]]:
+; MULTI-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; MULTI-NEXT:    [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; MULTI-NEXT:    br i1 [[CMP1]], label %[[E1]], label %[[LOOP_LATCH]]
+; MULTI:       [[LOOP_LATCH]]:
+; MULTI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; MULTI-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
+; MULTI-NEXT:    [[INC]] = add nuw i64 [[I_07]], 1
+; MULTI-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; MULTI-NEXT:    br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
+; MULTI:       [[E1]]:
+; MULTI-NEXT:    ret i64 0
+; MULTI:       [[E2]]:
+; MULTI-NEXT:    ret i64 1
+;
+; DEFAULT-LABEL: define i64 @multi_exit_with_store(
+; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:  [[ENTRY:.*]]:
+; DEFAULT-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
+; DEFAULT-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 8
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT:       [[VECTOR_PH]]:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 8
+; DEFAULT-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; DEFAULT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP5]], i64 8, i64 [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]]
+; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
+; DEFAULT:       [[VECTOR_BODY]]:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; DEFAULT-NEXT:    store <8 x i32> zeroinitializer, ptr [[TMP3]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT:       [[MIDDLE_BLOCK]]:
+; DEFAULT-NEXT:    br label %[[SCALAR_PH]]
+; DEFAULT:       [[SCALAR_PH]]:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT:    br label %[[LOOP_HEADER:.*]]
+; DEFAULT:       [[LOOP_HEADER]]:
+; DEFAULT-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; DEFAULT-NEXT:    [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; DEFAULT-NEXT:    br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]]
+; DEFAULT:       [[LOOP_LATCH]]:
+; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; DEFAULT-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT:    [[INC]] = add nuw i64 [[I_07]], 1
+; DEFAULT-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; DEFAULT-NEXT:    br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT:       [[E1]]:
+; DEFAULT-NEXT:    ret i64 0
+; DEFAULT:       [[E2]]:
+; DEFAULT-NEXT:    ret i64 1
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %c.1 = icmp uge i64 %iv, %N
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  store i32 0, ptr %arrayidx
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  ret i64 0
+
+e2:
+  ret i64 1
+
+}
+;.
+; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; MULTI: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MULTI: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index cd128979fc1431..1c02f10753745c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
 ; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i64 [[TMP0]]
-; CHECK-NEXT:    [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]])
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[N]], -1
 ; CHECK-NEXT:    [[TMP4:%.*]] = freeze i64 [[TMP3]]
 ; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]])
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP32:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT:    [[TMP33:%.*]] = freeze i64 [[TMP32]]
+; CHECK-NEXT:    [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
new file mode 100644
index 00000000000000..06672680ff715c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization -debug %s 2>&1 | FileCheck %s
+
+define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={2,4,8},UF={1}' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%7>
+; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<%0>
+; CHECK-NEXT:     vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT:     EMIT vp<%5> = not ir<%c.1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<%4>
+; CHECK-NEXT:     vp<%6> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN store vp<%6>, ir<0>, vp<%5>
+; CHECK-NEXT:     EMIT vp<%7> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT:     EMIT vp<%8> = not vp<%5>
+; CHECK-NEXT:     EMIT vp<%9> = any-of vp<%8>
+; CHECK-NEXT:     EMIT vp<%10> = icmp eq vp<%7>, vp<%2>
+; CHECK-NEXT:     EMIT branch-on-multi-cond vp<%9>, vp<%10>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%12> = icmp eq ir<128>, vp<%2>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%12>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %c.1 = icmp uge i64 %iv, %N
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  store i32 0, ptr %arrayidx
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  %p1 = phi i64 [ 0, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={2,4,8},UF={1}' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%7>
+; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<%0>
+; CHECK-NEXT:     vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT:     EMIT vp<%5> = not ir<%c.1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<%4>
+; CHECK-NEXT:     vp<%6> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN store vp<%6>, ir<0>, vp<%5>
+; CHECK-NEXT:     EMIT vp<%7> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT:     EMIT vp<%8> = not vp<%5>
+; CHECK-NEXT:     EMIT vp<%9> = any-of vp<%8>
+; CHECK-NEXT:     EMIT vp<%10> = icmp eq vp<%7>, vp<%2>
+; CHECK-NEXT:     EMIT branch-on-multi-cond vp<%9>, vp<%10>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e>:
+; CHECK-NEXT:    IR   %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<%12> = icmp eq ir<128>, vp<%2>
+; CHECK-NEXT:   EMIT branch-on-cond vp<%12>
+; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %c.1 = icmp uge i64 %iv, %N
+  br i1 %c.1, label %e, label %loop.latch
+
+loop.latch:
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  store i32 0, ptr %arrayidx
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e, label %loop.header
+
+e:
+  %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+  ret i64 %p1
+}



More information about the llvm-commits mailing list