[llvm] [VPlan] Dispatch to multiple exit blocks via middle blocks. (PR #112138)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sun Dec 8 12:52:28 PST 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/112138

From 245b56a80bca6369a9be3102308617f2a4a4d51b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 1 Oct 2024 20:57:24 +0100
Subject: [PATCH 01/15] [VPlan] Support VPIRBBs and VPIRInst phis with multiple
 predecessors.

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp        | 5 ++++-
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c1b97791331bcf..8609514c39e7d0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1056,7 +1056,10 @@ void VPlan::execute(VPTransformState *State) {
   State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
 
   // Generate code in the loop pre-header and body.
-  for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Entry);
+
+  for (VPBlockBase *Block : RPOT)
     Block->execute(State);
 
   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 41f13cc2d9a978..be3e958320e771 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -857,12 +857,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 void VPIRInstruction::execute(VPTransformState &State) {
   assert((isa<PHINode>(&I) || getNumOperands() == 0) &&
          "Only PHINodes can have extra operands");
-  if (getNumOperands() == 1) {
-    VPValue *ExitValue = getOperand(0);
+  for (const auto &[Idx, Op] : enumerate(operands())) {
+    VPValue *ExitValue = Op;
     auto Lane = vputils::isUniformAfterVectorization(ExitValue)
                     ? VPLane::getFirstLane()
                     : VPLane::getLastLaneForVF(State.VF);
-    auto *PredVPBB = cast<VPBasicBlock>(getParent()->getSinglePredecessor());
+    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
+    auto *PredVPBB = Pred->getExitingBasicBlock();
     BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
     // Set insertion point in PredBB in case an extract needs to be generated.
     // TODO: Model extracts explicitly.
@@ -890,7 +891,7 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
   O << Indent << "IR " << I;
 
   if (getNumOperands() != 0) {
-    assert(getNumOperands() == 1 && "can have at most 1 operand");
+    // assert(getNumOperands() == 1 && "can have at most 1 operand");
     O << " (extra operand: ";
     printOperands(O, SlotTracker);
     O << ")";

From 47258deea863675e43fd7fd48376dce131441dc5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 18 Sep 2024 21:35:57 +0100
Subject: [PATCH 02/15] [VPlan] Dispatch to multiple exit blocks via middle
 blocks.

This is a more lightweight variant of
https://github.com/llvm/llvm-project/pull/109193; it dispatches to the
multiple exit blocks via the middle blocks.
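
To illustrate the approach, below is a minimal LLVM IR sketch (not part of the
patch) of the control flow the transform produces, distilled from the
multi-exit-codegen.ll checks added further down; the function name, block
names, and the two i1 parameters are stand-ins for values the plan computes.
The vector latch branches on the OR of "any lane took the early exit" (an
any-of reduction of the negated early-exit mask) and the usual latch exit
compare; the new middle.split block then dispatches to the early exit block
or falls through to the regular middle block:

define void @cfg_sketch(i1 %any.early.exit, i1 %latch.exit.cmp) {
entry:
  br label %vector.body

vector.body:
  ; %any.early.exit stands in for the any-of reduction of the negated
  ; early-exit mask; %latch.exit.cmp for the induction == trip-count compare.
  %exit.cond = or i1 %any.early.exit, %latch.exit.cmp
  br i1 %exit.cond, label %middle.split, label %vector.body

middle.split:
  ; Early exit taken in some lane -> dispatch to e1; otherwise the regular
  ; middle block decides between the countable exit and the scalar loop.
  br i1 %any.early.exit, label %e1, label %middle.block

middle.block:
  br i1 true, label %e2, label %scalar.ph

scalar.ph:
  ret void

e1:
  ret void

e2:
  ret void
}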
---
 .../Vectorize/LoopVectorizationLegality.h     |   3 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  29 +++
 .../Transforms/Vectorize/LoopVectorize.cpp    |  82 +++---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  39 ++-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   1 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  16 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  82 ++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |   4 +
 .../Transforms/Vectorize/VPlanVerifier.cpp    |   8 -
 .../LoopVectorize/X86/multi-exit-codegen.ll   | 240 ++++++++++++++++++
 .../LoopVectorize/X86/multi-exit-cost.ll      |  18 +-
 .../LoopVectorize/X86/multi-exit-vplan.ll     | 148 +++++++++++
 12 files changed, 614 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index dc7e484a40a452..af6fae44cf0f09 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,6 +287,9 @@ class LoopVectorizationLegality {
   /// we can use in-order reductions.
   bool canVectorizeFPMath(bool EnableStrictReductions);
 
+  /// Returns true if the loop has an early exit that we can vectorize.
+  bool canVectorizeEarlyExit() const;
+
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking.
   bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d45..ee53d28a4c8282 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,6 +43,10 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                        cl::desc("Enable recognition of non-constant strided "
                                 "pointer induction variables."));
 
+static cl::opt<bool> EnableEarlyExitVectorization(
+    "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of loops with early exits."));
+
 namespace llvm {
 cl::opt<bool>
     HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1378,6 +1382,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
 }
 
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
+  // When vectorizing early exits, create predicates for all blocks except the
+  // header.
+  if (canVectorizeEarlyExit() && BB != TheLoop->getHeader())
+    return true;
   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 }
 
@@ -1514,6 +1522,27 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   return true;
 }
 
+bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
+  // Currently we only allow vectorizing loops with early exits if early-exit
+  // vectorization is explicitly enabled and the loop has metadata to force
+  // vectorization.
+  if (!EnableEarlyExitVectorization)
+    return false;
+
+  SmallVector<BasicBlock *> Exiting;
+  TheLoop->getExitingBlocks(Exiting);
+  if (Exiting.size() == 1)
+    return false;
+
+  LoopVectorizeHints Hints(TheLoop, true, *ORE);
+  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+    return false;
+
+  Function *Fn = TheLoop->getHeader()->getParent();
+  return Hints.allowVectorization(Fn, TheLoop,
+                                  true /*VectorizeOnlyWhenForced*/);
+}
+
 // Helper function to canVectorizeLoopNestCFG.
 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                     bool UseVPlanNativePath) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8653498d32a12..befe8f7c0076a3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1363,9 +1363,11 @@ class LoopVectorizationCostModel {
     // If we might exit from anywhere but the latch, must run the exiting
     // iteration in scalar form.
     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-      LLVM_DEBUG(
-          dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
-      return true;
+      if (!Legal->canVectorizeEarlyExit()) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+        return true;
+      }
     }
     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2575,7 +2577,8 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
-  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
+  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) ||
+          Legal->canVectorizeEarlyExit()) &&
          "multiple exit loop without required epilogue?");
 
   LoopMiddleBlock =
@@ -2758,8 +2761,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
   // value (the value that feeds into the phi from the loop latch).
   // We allow both, but they, obviously, have different values.
 
-  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
-
   DenseMap<Value *, Value *> MissingVals;
 
   // An external user of the last iteration's value should see the value that
@@ -2819,6 +2820,9 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
       PHI->addIncoming(I.second, MiddleBlock);
   }
+
+  assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
+         "Expected a single exit block");
 }
 
 namespace {
@@ -3599,7 +3603,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   TheLoop->getExitingBlocks(Exiting);
   for (BasicBlock *E : Exiting) {
     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
-    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
+        (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit()))
       AddToWorklistIfAllowed(Cmp);
   }
 
@@ -7692,12 +7697,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.execute(&State);
 
   // 2.5 Collect reduction resume values.
-  auto *ExitVPBB =
-      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  for (VPRecipeBase &R : *ExitVPBB) {
-    createAndCollectMergePhiForReduction(
-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+  VPBasicBlock *ExitVPBB = nullptr;
+  if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
+    ExitVPBB = cast<VPBasicBlock>(
+        BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+    for (VPRecipeBase &R : *ExitVPBB) {
+      createAndCollectMergePhiForReduction(
+          dyn_cast<VPInstruction>(&R), State, OrigLoop,
+          State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+    }
   }
 
   // 2.6. Maintain Loop Hints
@@ -7723,6 +7731,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     LoopVectorizeHints Hints(L, true, *ORE);
     Hints.setAlreadyVectorized();
   }
+
   TargetTransformInfo::UnrollingPreferences UP;
   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7735,15 +7744,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   ILV.printDebugTracesAtEnd();
 
   // 4. Adjust branch weight of the branch in the middle block.
-  auto *MiddleTerm =
-      cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
-  if (MiddleTerm->isConditional() &&
-      hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-    // Assume that `Count % VectorTripCount` is equally distributed.
-    unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-    assert(TripCount > 0 && "trip count should not be zero");
-    const uint32_t Weights[] = {1, TripCount - 1};
-    setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+  if (ExitVPBB) {
+    auto *MiddleTerm =
+        cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
+    if (MiddleTerm->isConditional() &&
+        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      const uint32_t Weights[] = {1, TripCount - 1};
+      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+    }
   }
 
   return State.ExpandedSCEVs;
@@ -8128,7 +8139,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   // If source is an exiting block, we know the exit edge is dynamically dead
   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
   // adding uses of an otherwise potentially dead instruction.
-  if (OrigLoop->isLoopExiting(Src))
+  if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src))
     return EdgeMaskCache[Edge] = SrcMask;
 
   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8778,6 +8789,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+  if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
+    return {};
   auto *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   // No edge from the middle block to the unique exit block has been inserted
@@ -8863,6 +8876,8 @@ static void addLiveOutsForFirstOrderRecurrences(
   // TODO: Should be replaced by
   // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
   // scalar region is modeled as well.
+  if (!VectorRegion->getSingleSuccessor())
+    return;
   auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
   VPBasicBlock *ScalarPHVPBB = nullptr;
   if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9146,10 +9161,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
-  SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
-      OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
-  addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
-  addUsersInExitBlock(*Plan, ExitUsersToFix);
+  if (Legal->canVectorizeEarlyExit()) {
+    VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
+                                        RecipeBuilder);
+  } else {
+    SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
+        OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+    addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+    addUsersInExitBlock(*Plan, ExitUsersToFix);
+  }
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9277,8 +9297,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   using namespace VPlanPatternMatch;
   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   for (VPRecipeBase &R : Header->phis()) {
     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
     if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
@@ -9297,8 +9315,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       for (VPUser *U : Cur->users()) {
         auto *UserRecipe = cast<VPSingleDefRecipe>(U);
         if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
-          assert(UserRecipe->getParent() == MiddleVPBB &&
-                 "U must be either in the loop region or the middle block.");
           continue;
         }
         Worklist.insert(UserRecipe);
@@ -9403,6 +9419,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   }
   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
   Builder.setInsertPoint(&*LatchVPBB->begin());
+  if (!VectorLoopRegion->getSingleSuccessor())
+    return;
+  VPBasicBlock *MiddleVPBB =
+      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
   for (VPRecipeBase &R :
        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8609514c39e7d0..eb7c808551340d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
     // backedges. A backward successor is set when the branch is created.
     const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
     unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+    if (TermBr->getSuccessor(idx) &&
+        PredVPBlock == getPlan()->getVectorLoopRegion() &&
+        PredVPBlock->getNumSuccessors()) {
+      // Update PredBB and TermBr for BranchOnMultiCond in predecessor.
+      PredBB = TermBr->getSuccessor(1);
+      TermBr = cast<BranchInst>(PredBB->getTerminator());
+      idx = 0;
+    }
     assert(!TermBr->getSuccessor(idx) &&
            "Trying to reset an existing successor block.");
     TermBr->setSuccessor(idx, IRBB);
@@ -908,8 +916,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
 
-  VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
   if (!RequiresScalarEpilogueCheck) {
+    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
     return Plan;
   }
@@ -923,10 +931,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
-  auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
-  // The connection order corresponds to the operands of the conditional branch.
-  VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
-  VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  if (IRExitBlock) {
+    auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+    // The connection order corresponds to the operands of the conditional
+    // branch.
+    VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+    VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  }
 
   auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
   // Here we use the same DebugLoc as the scalar loop latch terminator instead
@@ -1031,7 +1043,9 @@ void VPlan::execute(VPTransformState *State) {
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
   VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+      getVectorLoopRegion()->getNumSuccessors() == 1
+          ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
+          : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
   // Find the VPBB for the scalar preheader, relying on the current structure
   // when creating the middle block and its successrs: if there's a single
   // predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1044,6 +1058,10 @@ void VPlan::execute(VPTransformState *State) {
       MiddleSuccs.size() == 1 ? MiddleSuccs[0] : MiddleSuccs[1]);
   assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
          "scalar preheader cannot be wrapped already");
+  if (ScalarPhVPBB->getNumSuccessors() != 0) {
+    ScalarPhVPBB = cast<VPBasicBlock>(ScalarPhVPBB->getSuccessors()[1]);
+    MiddleVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+  }
   replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
   replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
 
@@ -1065,6 +1083,10 @@ void VPlan::execute(VPTransformState *State) {
   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
+  if (!getVectorLoopRegion()->getSingleSuccessor())
+    VectorLatchBB =
+        cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
+
   // Fix the latch value of canonical, reduction and first-order recurrences
   // phis in the vector loop.
   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
@@ -1091,7 +1113,10 @@ void VPlan::execute(VPTransformState *State) {
       // Move the last step to the end of the latch block. This ensures
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
+      if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI())
+        Inc->moveBefore(VectorLatchBB->getTerminator());
+      else
+        Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
 
       // Use the steps for the last part as backedge value for the induction.
       if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 59a084401cc9bf..21f44eac188936 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1274,6 +1274,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
+    AnyOf,
   };
 
 private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index be3e958320e771..9d5c609ad26043 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
     default:
       return true;
     }
+  case VPExpandSCEVSC:
+    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -160,6 +162,8 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPPredInstPHISC:
   case VPScalarCastSC:
     return false;
+  case VPExpandSCEVSC:
+    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInstructionSC:
     return mayWriteToMemory();
   case VPWidenCallSC: {
@@ -399,6 +403,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
+  case VPInstruction::AnyOf:
     return true;
   default:
     return false;
@@ -674,6 +679,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     }
     return NewPhi;
   }
+  case VPInstruction::AnyOf: {
+    Value *A = State.get(getOperand(0));
+    return Builder.CreateOrReduce(A);
+  }
 
   default:
     llvm_unreachable("Unsupported opcode for instruction");
@@ -682,7 +691,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
 bool VPInstruction::isVectorToScalar() const {
   return getOpcode() == VPInstruction::ExtractFromEnd ||
-         getOpcode() == VPInstruction::ComputeReductionResult;
+         getOpcode() == VPInstruction::ComputeReductionResult ||
+         getOpcode() == VPInstruction::AnyOf;
 }
 
 bool VPInstruction::isSingleScalar() const {
@@ -745,6 +755,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
     return false;
   case Instruction::ICmp:
   case Instruction::Select:
+  case Instruction::Or:
   case VPInstruction::PtrAdd:
     // TODO: Cover additional opcodes.
     return vputils::onlyFirstLaneUsed(this);
@@ -840,6 +851,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
+  case VPInstruction::AnyOf:
+    O << "any-of";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d50f3c0c3f3e04..a86498eb9aa30c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -515,6 +515,12 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       Plan.getEntry());
 
+  for (VPRecipeBase &R : make_early_inc_range(
+           reverse(*cast<VPBasicBlock>(Plan.getPreheader())))) {
+    if (isDeadRecipe(R))
+      R.eraseFromParent();
+  }
+
   for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
     // The recipes in the block are processed in reverse order, to catch chains
     // of dead recipes.
@@ -1696,3 +1702,79 @@ void VPlanTransforms::createInterleaveGroups(
       }
   }
 }
+
+void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+                                         Loop *OrigLoop,
+                                         VPRecipeBuilder &RecipeBuilder) {
+  auto *LatchVPBB =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+  VPBuilder Builder(LatchVPBB->getTerminator());
+  auto *MiddleVPBB =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+
+  const SCEV *BackedgeTakenCount =
+      SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+  const SCEV *TripCount = SE.getTripCountFromExitCount(
+      BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop);
+  VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE);
+  Plan.getTripCount()->replaceAllUsesWith(NewTC);
+  Plan.resetTripCount(NewTC);
+
+  VPValue *EarlyExitTaken = nullptr;
+  SmallVector<BasicBlock *> ExitingBBs;
+  OrigLoop->getExitingBlocks(ExitingBBs);
+  for (BasicBlock *Exiting : ExitingBBs) {
+    auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
+    BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
+    BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
+    VPIRBasicBlock *VPExitBlock;
+    if (OrigLoop->getUniqueExitBlock())
+      VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+    else
+      VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+          !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+
+    for (VPRecipeBase &R : *VPExitBlock) {
+      auto *ExitIRI = cast<VPIRInstruction>(&R);
+      auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
+      if (!ExitPhi)
+        break;
+      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting);
+      VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
+      ExitIRI->addOperand(V);
+    }
+
+    if (Exiting == OrigLoop->getLoopLatch()) {
+      if (MiddleVPBB->getNumSuccessors() == 0) {
+        VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+        VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
+        VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+      }
+      continue;
+    }
+
+    VPValue *M = RecipeBuilder.getBlockInMask(
+        OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+    auto *N = Builder.createNot(M);
+    EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N});
+
+    VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+    VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB);
+    VPBlockUtils::insertBlockAfter(NewMiddle, LoopRegion);
+    VPBlockUtils::connectBlocks(NewMiddle, VPExitBlock);
+    VPBlockUtils::connectBlocks(NewMiddle, MiddleVPBB);
+
+    VPBuilder MiddleBuilder(NewMiddle);
+    MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
+    // MiddleVPBB = NewMiddle;
+  }
+  auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
+  auto *IsLatchExiting = Builder.createICmp(
+      CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1));
+  auto *AnyExiting =
+      Builder.createNaryOp(Instruction::Or, {EarlyExitTaken, IsLatchExiting});
+  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExiting);
+  Term->eraseFromParent();
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 60a44bfb0dca6b..9745211db275f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -123,6 +123,10 @@ struct VPlanTransforms {
 
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
+
+  static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
+                                 Loop *OrigLoop,
+                                 VPRecipeBuilder &RecipeBuilder);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7ea5ee341cc547..1ac79f8887ab46 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -248,14 +248,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
     return false;
   }
 
-  VPBlockBase *MiddleBB =
-      IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor();
-  if (IRBB != IRBB->getPlan()->getPreheader() &&
-      IRBB->getSinglePredecessor() != MiddleBB) {
-    errs() << "VPIRBasicBlock can only be used as pre-header or a successor of "
-              "middle-block at the moment!\n";
-    return false;
-  }
   return true;
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
new file mode 100644
index 00000000000000..0c33715c6bd271
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization %s | FileCheck --check-prefix=MULTI %s
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s
+
+define i64 @multi_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; MULTI-NEXT:  [[ENTRY:.*]]:
+; MULTI-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; MULTI:       [[VECTOR_PH]]:
+; MULTI-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; MULTI-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; MULTI-NEXT:    br label %[[VECTOR_BODY:.*]]
+; MULTI:       [[VECTOR_BODY]]:
+; MULTI-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MULTI-NEXT:    [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; MULTI-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
+; MULTI-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; MULTI-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
+; MULTI-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; MULTI-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; MULTI-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; MULTI-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; MULTI-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; MULTI-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; MULTI:       [[MIDDLE_SPLIT]]:
+; MULTI-NEXT:    br i1 [[TMP6]], label %[[E1:.*]], label %[[MIDDLE_BLOCK:.*]]
+; MULTI:       [[MIDDLE_BLOCK]]:
+; MULTI-NEXT:    br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]]
+; MULTI:       [[SCALAR_PH]]:
+; MULTI-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MULTI-NEXT:    br label %[[LOOP_HEADER:.*]]
+; MULTI:       [[LOOP_HEADER]]:
+; MULTI-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; MULTI-NEXT:    [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; MULTI-NEXT:    br i1 [[CMP1]], label %[[E1]], label %[[LOOP_LATCH]]
+; MULTI:       [[LOOP_LATCH]]:
+; MULTI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; MULTI-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
+; MULTI-NEXT:    [[INC]] = add nuw i64 [[I_07]], 1
+; MULTI-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; MULTI-NEXT:    br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; MULTI:       [[E1]]:
+; MULTI-NEXT:    ret i64 0
+; MULTI:       [[E2]]:
+; MULTI-NEXT:    ret i64 1
+;
+; DEFAULT-LABEL: define i64 @multi_exit_with_store(
+; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT:  [[ENTRY:.*]]:
+; DEFAULT-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
+; DEFAULT-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT:       [[VECTOR_PH]]:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; DEFAULT-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; DEFAULT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]]
+; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
+; DEFAULT:       [[VECTOR_BODY]]:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; DEFAULT-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT:       [[MIDDLE_BLOCK]]:
+; DEFAULT-NEXT:    br label %[[SCALAR_PH]]
+; DEFAULT:       [[SCALAR_PH]]:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT:    br label %[[LOOP_HEADER:.*]]
+; DEFAULT:       [[LOOP_HEADER]]:
+; DEFAULT-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; DEFAULT-NEXT:    [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
+; DEFAULT-NEXT:    br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]]
+; DEFAULT:       [[LOOP_LATCH]]:
+; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
+; DEFAULT-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT:    [[INC]] = add nuw i64 [[I_07]], 1
+; DEFAULT-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
+; DEFAULT-NEXT:    br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT:       [[E1]]:
+; DEFAULT-NEXT:    ret i64 0
+; DEFAULT:       [[E2]]:
+; DEFAULT-NEXT:    ret i64 1
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %c.1 = icmp uge i64 %iv, %N
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  store i32 0, ptr %arrayidx
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
+
+e1:
+  ret i64 0
+
+e2:
+  ret i64 1
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; MULTI-NEXT:  [[ENTRY:.*]]:
+; MULTI-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; MULTI:       [[VECTOR_PH]]:
+; MULTI-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; MULTI-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; MULTI-NEXT:    br label %[[VECTOR_BODY:.*]]
+; MULTI:       [[VECTOR_BODY]]:
+; MULTI-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; MULTI-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MULTI-NEXT:    [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; MULTI-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
+; MULTI-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; MULTI-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
+; MULTI-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; MULTI-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
+; MULTI-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; MULTI-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; MULTI-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; MULTI-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; MULTI-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; MULTI:       [[MIDDLE_SPLIT]]:
+; MULTI-NEXT:    br i1 [[TMP6]], label %[[E:.*]], label %[[MIDDLE_BLOCK:.*]]
+; MULTI:       [[MIDDLE_BLOCK]]:
+; MULTI-NEXT:    br i1 true, label %[[E]], label %[[SCALAR_PH]]
+; MULTI:       [[SCALAR_PH]]:
+; MULTI-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; MULTI-NEXT:    br label %[[LOOP_HEADER:.*]]
+; MULTI:       [[LOOP_HEADER]]:
+; MULTI-NEXT:    [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; MULTI-NEXT:    [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
+; MULTI-NEXT:    br i1 [[C_1]], label %[[E]], label %[[LOOP_LATCH]]
+; MULTI:       [[LOOP_LATCH]]:
+; MULTI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
+; MULTI-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
+; MULTI-NEXT:    [[INC]] = add nuw i64 [[IV]], 1
+; MULTI-NEXT:    [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; MULTI-NEXT:    br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; MULTI:       [[E]]:
+; MULTI-NEXT:    [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[MIDDLE_SPLIT]] ]
+; MULTI-NEXT:    ret i64 [[P1]]
+;
+; DEFAULT-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
+; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT:  [[ENTRY:.*]]:
+; DEFAULT-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; DEFAULT:       [[VECTOR_PH]]:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; DEFAULT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
+; DEFAULT:       [[VECTOR_BODY]]:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; DEFAULT-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT:       [[MIDDLE_BLOCK]]:
+; DEFAULT-NEXT:    br label %[[SCALAR_PH]]
+; DEFAULT:       [[SCALAR_PH]]:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT:    br label %[[LOOP_HEADER:.*]]
+; DEFAULT:       [[LOOP_HEADER]]:
+; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; DEFAULT-NEXT:    [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
+; DEFAULT-NEXT:    br i1 [[C_1]], label %[[E:.*]], label %[[LOOP_LATCH]]
+; DEFAULT:       [[LOOP_LATCH]]:
+; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
+; DEFAULT-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT:    [[INC]] = add nuw i64 [[IV]], 1
+; DEFAULT-NEXT:    [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; DEFAULT-NEXT:    br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT:       [[E]]:
+; DEFAULT-NEXT:    [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ]
+; DEFAULT-NEXT:    ret i64 [[P1]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %c.1 = icmp uge i64 %iv, %N
+  br i1 %c.1, label %e, label %loop.latch
+
+loop.latch:
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  store i32 0, ptr %arrayidx
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
+
+e:
+  %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+  ret i64 %p1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; MULTI: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; MULTI: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MULTI: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; MULTI: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; MULTI: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index cd128979fc1431..1c02f10753745c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
 ; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze i64 [[TMP0]]
-; CHECK-NEXT:    [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]])
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[N]], -1
 ; CHECK-NEXT:    [[TMP4:%.*]] = freeze i64 [[TMP3]]
 ; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]])
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP32:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT:    [[TMP33:%.*]] = freeze i64 [[TMP32]]
+; CHECK-NEXT:    [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
new file mode 100644
index 00000000000000..5c5d532b93bc89
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
+
+define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %c.1 = icmp uge i64 %iv, %N
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  store i32 0, ptr %arrayidx
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
+
+e1:
+  %p1 = phi i64 [ 0, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e>:
+; CHECK-NEXT:    IR   %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %c.1 = icmp uge i64 %iv, %N
+  br i1 %c.1, label %e, label %loop.latch
+
+loop.latch:
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  store i32 0, ptr %arrayidx
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
+
+e:
+  %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+  ret i64 %p1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}

From 3831acb97053230cb09f8316ce1ada17be50564c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 31 Oct 2024 19:48:40 +0000
Subject: [PATCH 03/15] !fixup address first set of comments, thanks!

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 13 ++-----------
 llvm/lib/Transforms/Vectorize/VPlan.cpp         | 16 +---------------
 2 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3a69750460f4d8..80a0fda81aeaf1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7711,16 +7711,13 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   BestVPlan.execute(&State);
 
   // 2.5 Collect reduction resume values.
-  VPBasicBlock *ExitVPBB = nullptr;
-  if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) {
-    ExitVPBB = cast<VPBasicBlock>(
-        BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+  VPBasicBlock *ExitVPBB =
+      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
   if (VectorizingEpilogue)
     for (VPRecipeBase &R : *ExitVPBB) {
       fixReductionScalarResumeWhenVectorizingEpilog(
           &R, State, State.CFG.VPBB2IRBB[ExitVPBB]);
     }
-  }
 
   // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
@@ -8809,8 +8806,6 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
-  if (!Plan.getVectorLoopRegion()->getSingleSuccessor())
-    return {};
   auto *MiddleVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
   // No edge from the middle block to the unique exit block has been inserted
@@ -8896,8 +8891,6 @@ static void addLiveOutsForFirstOrderRecurrences(
   // TODO: Should be replaced by
   // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
   // scalar region is modeled as well.
-  if (!VectorRegion->getSingleSuccessor())
-    return;
   auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
   VPBasicBlock *ScalarPHVPBB = nullptr;
   if (MiddleVPBB->getNumSuccessors() == 2) {
@@ -9447,8 +9440,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   }
   VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
   Builder.setInsertPoint(&*LatchVPBB->begin());
-  if (!VectorLoopRegion->getSingleSuccessor())
-    return;
   VPBasicBlock *MiddleVPBB =
       cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
   VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index be3579b4cd5dde..4032468b4f76da 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -474,14 +474,6 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
     // backedges. A backward successor is set when the branch is created.
     const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
     unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
-    if (TermBr->getSuccessor(idx) &&
-        PredVPBlock == getPlan()->getVectorLoopRegion() &&
-        PredVPBlock->getNumSuccessors()) {
-      // Update PredBB and TermBr for BranchOnMultiCond in predecessor.
-      PredBB = TermBr->getSuccessor(1);
-      TermBr = cast<BranchInst>(PredBB->getTerminator());
-      idx = 0;
-    }
     assert(!TermBr->getSuccessor(idx) &&
            "Trying to reset an existing successor block.");
     TermBr->setSuccessor(idx, IRBB);
@@ -1043,9 +1035,7 @@ void VPlan::execute(VPTransformState *State) {
   // VPlan execution rather than earlier during VPlan construction.
   BasicBlock *MiddleBB = State->CFG.ExitBB;
   VPBasicBlock *MiddleVPBB =
-      getVectorLoopRegion()->getNumSuccessors() == 1
-          ? cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0])
-          : cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[1]);
+      cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
   // Find the VPBB for the scalar preheader, relying on the current structure
   // when creating the middle block and its successors: if there's a single
   // predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1083,10 +1073,6 @@ void VPlan::execute(VPTransformState *State) {
   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
 
-  if (!getVectorLoopRegion()->getSingleSuccessor())
-    VectorLatchBB =
-        cast<BranchInst>(VectorLatchBB->getTerminator())->getSuccessor(1);
-
   // Fix the latch value of canonical, reduction and first-order recurrences
   // phis in the vector loop.
   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();

>From 64db0eea4073e1cdc3d394155754ed0653ca0c3d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 4 Nov 2024 21:10:24 +0000
Subject: [PATCH 04/15] !fixup clean up merge failures

---
 llvm/lib/Transforms/Vectorize/VPlan.cpp        | 18 ++++++++++--------
 llvm/lib/Transforms/Vectorize/VPlan.h          |  4 ++--
 .../Transforms/Vectorize/VPlanTransforms.cpp   | 11 ++---------
 .../LoopVectorize/X86/multi-exit-vplan.ll      | 11 ++++++++++-
 4 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 924b63bbd9639f..e2c063928e9906 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -911,7 +911,6 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
   VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
   if (!RequiresScalarEpilogueCheck) {
-    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
     VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
     return Plan;
   }
@@ -925,14 +924,17 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
-  if (IRExitBlock) {
-    auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
-    // The connection order corresponds to the operands of the conditional
-    // branch.
-    VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
-    VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
-    VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+  if (!IRExitBlock) {
+    auto *Term = cast<BranchInst>(TheLoop->getLoopLatch()->getTerminator());
+    IRExitBlock = TheLoop->contains(Term->getSuccessor(0))
+                      ? Term->getSuccessor(1)
+                      : Term->getSuccessor(0);
   }
+  auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
+  // The connection order corresponds to the operands of the conditional
+  // branch.
+  VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
+  VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
 
   auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
   // Here we use the same DebugLoc as the scalar loop latch terminator instead
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 861fea7ff469ff..8efa648a7e1ea7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3822,10 +3822,10 @@ class VPlan {
   /// whether to execute the scalar tail loop or the exit block from the loop
   /// latch.
   const VPBasicBlock *getMiddleBlock() const {
-    return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+    return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
   }
   VPBasicBlock *getMiddleBlock() {
-    return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
+    return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
   }
 
   /// The trip count of the original loop.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index eed3efd666e712..1369693b01971c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1797,8 +1797,7 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
   auto *LatchVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
   VPBuilder Builder(LatchVPBB->getTerminator());
-  auto *MiddleVPBB =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+  auto *MiddleVPBB = Plan.getMiddleBlock();
 
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
 
@@ -1818,7 +1817,7 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
     BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
     BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
     VPIRBasicBlock *VPExitBlock;
-    if (OrigLoop->getUniqueExitBlock())
+    if (OrigLoop->getUniqueExitBlock() || Exiting == OrigLoop->getLoopLatch())
       VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
     else
       VPExitBlock = VPIRBasicBlock::fromBasicBlock(
@@ -1835,11 +1834,6 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
     }
 
     if (Exiting == OrigLoop->getLoopLatch()) {
-      if (MiddleVPBB->getNumSuccessors() == 0) {
-        VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
-        VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock);
-        VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
-      }
       continue;
     }
 
@@ -1856,7 +1850,6 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
 
     VPBuilder MiddleBuilder(NewMiddle);
     MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
-    // MiddleVPBB = NewMiddle;
   }
   auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
   auto *IsLatchExiting = Builder.createICmp(
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
index 5c5d532b93bc89..47304c571bfcb1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -49,9 +49,13 @@ define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
-;
 entry:
   br label %loop.header
 
@@ -120,6 +124,11 @@ define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
 ; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ;

>From 0f8aedfaf89bc6dbe18bd00e6bad0aad52db10f3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 5 Nov 2024 13:50:43 +0000
Subject: [PATCH 05/15] !fixup address latest comments, thanks!

---
 .../Vectorize/LoopVectorizationLegality.cpp   |   8 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 121 ++++++++++--------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  13 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  48 +++----
 .../Transforms/Vectorize/VPlanTransforms.h    |   6 +-
 .../LoopVectorize/X86/multi-exit-vplan.ll     |  82 +-----------
 6 files changed, 98 insertions(+), 180 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0d8bda5a2112c7..ed3808d2f30bf1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -43,10 +43,6 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                        cl::desc("Enable recognition of non-constant strided "
                                 "pointer induction variables."));
 
-static cl::opt<bool>
-    EnableEarlyExitVectorization("enable-early-exit-vectorization",
-                                 cl::init(false), cl::Hidden, cl::desc(""));
-
 namespace llvm {
 cl::opt<bool>
     HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
@@ -1381,7 +1377,7 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
   // When vectorizing early exits, create predicates for all blocks, except the
   // header.
-  if (canVectorizeEarlyExit() && BB != TheLoop->getHeader())
+  if (hasUncountableEarlyExit() && BB != TheLoop->getHeader())
     return true;
   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 }
@@ -1523,8 +1519,6 @@ bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
   // Currently only allow vectorizing loops with early exits, if early-exit
   // vectorization is explicitly enabled and the loop has metadata to force
   // vectorization.
-  if (!EnableEarlyExitVectorization)
-    return false;
 
   SmallVector<BasicBlock *> Exiting;
   TheLoop->getExitingBlocks(Exiting);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 60607b7cf6b46c..47e78a916f8cea 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -384,6 +384,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
     cl::Hidden,
     cl::desc("Try wider VFs if they enable the use of vector variants"));
 
+static cl::opt<bool> EnableEarlyExitVectorization(
+    "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Enable vectorization of early exit loops with uncountable exits."));
+
 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
 // variables not overflowing do not hold. See `emitSCEVChecks`.
 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1358,14 +1363,13 @@ class LoopVectorizationCostModel {
       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
       return false;
     }
-    // If we might exit from anywhere but the latch, must run the exiting
-    // iteration in scalar form.
-    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-      if (!Legal->canVectorizeEarlyExit()) {
-        LLVM_DEBUG(
-            dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
-        return true;
-      }
+    // If we might exit from anywhere but the latch and early exit vectorization
+    // is disabled, we must run the exiting iteration in scalar form.
+    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
+        !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
+      LLVM_DEBUG(
+          dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+      return true;
     }
     if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
       LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
@@ -2576,7 +2580,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   assert(LoopVectorPreHeader && "Invalid loop structure");
   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector()) ||
-          Legal->canVectorizeEarlyExit()) &&
+          Legal->hasUncountableEarlyExit()) &&
          "multiple exit loop without required epilogue?");
 
   LoopMiddleBlock =
@@ -2809,6 +2813,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
     }
   }
 
+  assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
+         "Expected a single exit block for escaping values");
   for (auto &I : MissingVals) {
     PHINode *PHI = cast<PHINode>(I.first);
     // One corner case we have to handle is two IVs "chasing" each-other,
@@ -2819,9 +2825,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
       PHI->addIncoming(I.second, MiddleBlock);
   }
-
-  assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
-         "Expected a single exit block");
 }
 
 namespace {
@@ -3597,7 +3600,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   for (BasicBlock *E : Exiting) {
     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
-        (TheLoop->getLoopLatch() == E || !Legal->canVectorizeEarlyExit()))
+        (TheLoop->getLoopLatch() == E || !Legal->hasUncountableEarlyExit()))
       AddToWorklistIfAllowed(Cmp);
   }
 
@@ -8144,7 +8147,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   // If source is an exiting block, we know the exit edge is dynamically dead
   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
   // adding uses of an otherwise potentially dead instruction.
-  if (!Legal->canVectorizeEarlyExit() && OrigLoop->isLoopExiting(Src))
+  if (!Legal->hasUncountableEarlyExit() && OrigLoop->isLoopExiting(Src))
     return EdgeMaskCache[Edge] = SrcMask;
 
   VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8835,39 +8838,43 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
 static SetVector<VPIRInstruction *> collectUsersInExitBlock(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
-  auto *MiddleVPBB = Plan.getMiddleBlock();
-  // No edge from the middle block to the unique exit block has been inserted
-  // and there is nothing to fix from vector loop; phis should have incoming
-  // from scalar loop only.
-  if (MiddleVPBB->getNumSuccessors() != 2)
-    return {};
   SetVector<VPIRInstruction *> ExitUsersToFix;
-  VPBasicBlock *ExitVPBB = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
-  BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
-  for (VPRecipeBase &R : *ExitVPBB) {
-    auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
-    if (!ExitIRI)
-      continue;
-    auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
-    if (!ExitPhi)
-      break;
-    Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
-    VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
-    // Exit values for inductions are computed and updated outside of VPlan and
-    // independent of induction recipes.
-    // TODO: Compute induction exit values in VPlan.
-    if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
-         !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
-        isa<VPWidenPointerInductionRecipe>(V) ||
-        (isa<Instruction>(IncomingValue) &&
-         OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
-         any_of(IncomingValue->users(), [&Inductions](User *U) {
-           auto *P = dyn_cast<PHINode>(U);
-           return P && Inductions.contains(P);
-         })))
+  for (VPBlockBase *VPB : vp_depth_first_shallow(
+           Plan.getVectorLoopRegion()->getSingleSuccessor())) {
+    if (VPB->getNumSuccessors() != 0 || VPB == Plan.getScalarHeader())
       continue;
-    ExitUsersToFix.insert(ExitIRI);
-    ExitIRI->addOperand(V);
+    auto *ExitVPBB = cast<VPIRBasicBlock>(VPB);
+    BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
+    BasicBlock *ExitingBB = find_singleton<BasicBlock>(
+        to_vector(predecessors(ExitBB)),
+        [OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
+          return OrigLoop->contains(Pred) ? Pred : nullptr;
+        });
+    for (VPRecipeBase &R : *ExitVPBB) {
+      auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
+      if (!ExitIRI)
+        continue;
+      auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
+      if (!ExitPhi)
+        break;
+      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+      VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
+      // Exit values for inductions are computed and updated outside of VPlan
+      // and independent of induction recipes.
+      // TODO: Compute induction exit values in VPlan.
+      if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+           !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+          isa<VPWidenPointerInductionRecipe>(V) ||
+          (isa<Instruction>(IncomingValue) &&
+           OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
+           any_of(IncomingValue->users(), [&Inductions](User *U) {
+             auto *P = dyn_cast<PHINode>(U);
+             return P && Inductions.contains(P);
+           })))
+        continue;
+      ExitUsersToFix.insert(ExitIRI);
+      ExitIRI->addOperand(V);
+    }
   }
   return ExitUsersToFix;
 }
@@ -9168,16 +9175,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
          "VPBasicBlock");
   RecipeBuilder.fixHeaderPhis();
 
-  if (Legal->canVectorizeEarlyExit()) {
-    VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop,
-                                        RecipeBuilder);
-  } else {
+  if (Legal->hasUncountableEarlyExit()) {
+    VPlanTransforms::handleUncountableEarlyExit(*Plan, *PSE.getSE(), OrigLoop,
+                                                RecipeBuilder);
+  }
     addScalarResumePhis(RecipeBuilder, *Plan);
     SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
         OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
     addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
     addUsersInExitBlock(*Plan, ExitUsersToFix);
-  }
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9940,12 +9946,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   }
 
   if (LVL.hasUncountableEarlyExit()) {
-    reportVectorizationFailure("Auto-vectorization of loops with uncountable "
-                               "early exit is not yet supported",
-                               "Auto-vectorization of loops with uncountable "
-                               "early exit is not yet supported",
-                               "UncountableEarlyExitLoopsUnsupported", ORE, L);
-    return false;
+    if (!EnableEarlyExitVectorization) {
+      reportVectorizationFailure("Auto-vectorization of loops with uncountable "
+                                 "early exit is not yet supported",
+                                 "Auto-vectorization of loops with uncountable "
+                                 "early exit is not yet supported",
+                                 "UncountableEarlyExitLoopsUnsupported", ORE,
+                                 L);
+      return false;
+    }
   }
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
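
Taken together with the cost-model change above, the gate for loops with an
uncountable early exit condenses to the following (illustrative sketch only,
not part of the patch; names as used above):

  if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
    // Same UncountableEarlyExitLoopsUnsupported remark as before; the loop
    // stays scalar.
    return false;
  }
  // With the flag set, requiresScalarEpilogue() no longer forces a scalar
  // epilogue for the early exit and planning continues.
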
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index ed2d6aa5f3df4b..1c609d89c47b9b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -878,15 +878,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader);
 
   // Create SCEV and VPValue for the trip count.
-
-  // Currently only loops with countable exits are vectorized, but calling
-  // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
-  // uncountable exits whilst also ensuring the symbolic maximum and known
-  // back-edge taken count remain identical for loops with countable exits.
+  // Use the symbolic maximum backedge-taken count, which is needed when
+  // vectorizing loops with uncountable early exits.
   const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
-  assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
-          BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
-         "Invalid loop count");
   ScalarEvolution &SE = *PSE.getSE();
   const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
                                                        InductionTy, TheLoop);
@@ -922,6 +916,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   // 3) Otherwise, construct a runtime check.
   BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
   if (!IRExitBlock) {
+    // If there's no unique exit block (i.e. we are vectorizing a loop with an
+    // uncountable early exit), use the exit block reached from the latch. The
+    // other uncountable exit blocks will be added later.
     auto *Term = cast<BranchInst>(TheLoop->getLoopLatch()->getTerminator());
     IRExitBlock = TheLoop->contains(Term->getSuccessor(0))
                       ? Term->getSuccessor(1)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1369693b01971c..d336827b23ddf3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1791,51 +1791,33 @@ void VPlanTransforms::createInterleaveGroups(
   }
 }
 
-void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
-                                         Loop *OrigLoop,
-                                         VPRecipeBuilder &RecipeBuilder) {
+void VPlanTransforms::handleUncountableEarlyExit(
+    VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
+    VPRecipeBuilder &RecipeBuilder) {
   auto *LatchVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
   VPBuilder Builder(LatchVPBB->getTerminator());
   auto *MiddleVPBB = Plan.getMiddleBlock();
-
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
 
-  const SCEV *BackedgeTakenCount =
-      SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch());
-  const SCEV *TripCount = SE.getTripCountFromExitCount(
-      BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop);
-  VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE);
-  Plan.getTripCount()->replaceAllUsesWith(NewTC);
-  Plan.resetTripCount(NewTC);
-
   VPValue *EarlyExitTaken = nullptr;
   SmallVector<BasicBlock *> ExitingBBs;
   OrigLoop->getExitingBlocks(ExitingBBs);
+
+  // Process all uncountable exiting blocks. For each exiting block, update
+  // EarlyExitTaken, which tracks if any uncountable early exit has been taken.
+  // Also split the middle block and branch to the exit block for the early exit
+  // if it has been taken.
   for (BasicBlock *Exiting : ExitingBBs) {
+    if (Exiting == OrigLoop->getLoopLatch())
+      continue;
+
     auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
     BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
     BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
     VPIRBasicBlock *VPExitBlock;
-    if (OrigLoop->getUniqueExitBlock() || Exiting == OrigLoop->getLoopLatch())
-      VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
-    else
-      VPExitBlock = VPIRBasicBlock::fromBasicBlock(
-          !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
-
-    for (VPRecipeBase &R : *VPExitBlock) {
-      auto *ExitIRI = cast<VPIRInstruction>(&R);
-      auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
-      if (!ExitPhi)
-        break;
-      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting);
-      VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue);
-      ExitIRI->addOperand(V);
-    }
-
-    if (Exiting == OrigLoop->getLoopLatch()) {
-      continue;
-    }
+    VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+        !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
 
     VPValue *M = RecipeBuilder.getBlockInMask(
         OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
@@ -1851,6 +1833,10 @@ void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
     VPBuilder MiddleBuilder(NewMiddle);
     MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
   }
+
+  // Replace the condition controlling the exit from the vector loop with one
+  // that exits if either the original latch condition is true or any early
+  // exit has been taken.
   auto *Term = dyn_cast<VPInstruction>(LatchVPBB->getTerminator());
   auto *IsLatchExiting = Builder.createICmp(
       CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1));
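
For context, the remainder of handleUncountableEarlyExit (cut off by this
hunk) feeds IsLatchExiting into a combined exit condition, which is what
produces the or/branch-on-cond recipes visible in the VPlan test output
below. Rough sketch only; the exact builder calls are an assumption, not
taken from the patch:

  VPValue *AnyExitTaken =
      Builder.createNaryOp(Instruction::Or, {EarlyExitTaken, IsLatchExiting});
  Builder.createNaryOp(VPInstruction::BranchOnCond, {AnyExitTaken});
  // The original BranchOnCount terminator is then removed.
  Term->eraseFromParent();
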
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b5b7144568ff54..f7bbae25279fce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -124,9 +124,9 @@ struct VPlanTransforms {
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
 
-  static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE,
-                                 Loop *OrigLoop,
-                                 VPRecipeBuilder &RecipeBuilder);
+  static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE,
+                                         Loop *OrigLoop,
+                                         VPRecipeBuilder &RecipeBuilder);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
index 47304c571bfcb1..5da97ef8f9b3bc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
 
+declare void @init(ptr)
+
 define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
 ; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
 ; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
@@ -57,16 +59,18 @@ define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 entry:
+  %src = alloca [128 x i32]
+  call void @init(ptr %src)
   br label %loop.header
 
 loop.header:
   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-  %c.1 = icmp uge i64 %iv, %N
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %c.1 = icmp eq i32 %l, 10
   br i1 %c.1, label %e1, label %loop.latch
 
 loop.latch:
-  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
-  store i32 0, ptr %arrayidx
   %inc = add nuw i64 %iv, 1
   %c.2 = icmp eq i64 %inc, 128
   br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
@@ -80,78 +84,6 @@ e2:
   ret i64 %p2
 }
 
-define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e>:
-; CHECK-NEXT:    IR   %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-;
-entry:
-  br label %loop.header
-
-loop.header:
-  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-  %c.1 = icmp uge i64 %iv, %N
-  br i1 %c.1, label %e, label %loop.latch
-
-loop.latch:
-  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
-  store i32 0, ptr %arrayidx
-  %inc = add nuw i64 %iv, 1
-  %c.2 = icmp eq i64 %inc, 128
-  br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
-
-e:
-  %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
-  ret i64 %p1
-}
-
 !1 = distinct !{!1, !2, !3}
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.enable", i1 true}

>From 9212f9647692387fb4a6ff8cc6a90b1fa2b73628 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 5 Nov 2024 15:07:50 +0000
Subject: [PATCH 06/15] [VPlan] Generalize collectUsersInExitBlocks for
 multiple exit bbs.

Generalize collectUsersInExitBlock to collect exit users in multiple
exit blocks. Exit blocks are leaf nodes of the VPlan (blocks without
successors), except for the scalar header.
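
The traversal that identifies those leaf blocks looks roughly as follows
(simplified excerpt, keeping the names used in the code):

  for (VPBlockBase *VPB : vp_depth_first_shallow(
           Plan.getVectorLoopRegion()->getSingleSuccessor())) {
    // Exit blocks are leaves (no successors) other than the scalar header.
    if (VPB->getNumSuccessors() != 0 || VPB == Plan.getScalarHeader())
      continue;
    auto *ExitVPBB = cast<VPIRBasicBlock>(VPB);
    // ... collect the VPIRInstruction phis in ExitVPBB ...
  }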

Split off from https://github.com/llvm/llvm-project/pull/112138
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 47e78a916f8cea..4ccbc8e95d1edf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8830,12 +8830,12 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
   }
 }
 
-// Collect VPIRInstructions for phis in the original exit block that are modeled
+// Collect VPIRInstructions for phis in the exit blocks that are modeled
 // in VPlan and add the exiting VPValue as operand. Some exiting values are not
 // modeled explicitly yet and won't be included. Those are un-truncated
 // VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
 // increments.
-static SetVector<VPIRInstruction *> collectUsersInExitBlock(
+static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
   SetVector<VPIRInstruction *> ExitUsersToFix;
@@ -8882,8 +8882,8 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
 // Add exit values to \p Plan. Extracts are added for each entry in \p
 // ExitUsersToFix if needed and their operands are updated.
 static void
-addUsersInExitBlock(VPlan &Plan,
-                    const SetVector<VPIRInstruction *> &ExitUsersToFix) {
+addUsersInExitBlocks(VPlan &Plan,
+                     const SetVector<VPIRInstruction *> &ExitUsersToFix) {
   if (ExitUsersToFix.empty())
     return;
 
@@ -9179,12 +9179,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     VPlanTransforms::handleUncountableEarlyExit(*Plan, *PSE.getSE(), OrigLoop,
                                                 RecipeBuilder);
   }
-    addScalarResumePhis(RecipeBuilder, *Plan);
-    SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
-        OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
-    addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
-    addUsersInExitBlock(*Plan, ExitUsersToFix);
-
+  addScalarResumePhis(RecipeBuilder, *Plan);
+  SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
+      OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+  addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
+  addUsersInExitBlocks(*Plan, ExitUsersToFix);
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
   // bring the VPlan to its final state.

>From 5cb0851d69513eee7c14f21b598a014925ec6ae1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 5 Nov 2024 15:36:04 +0000
Subject: [PATCH 07/15] !fixup address comments

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  62 ++---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  10 +-
 .../LoopVectorize/X86/multi-exit-vplan.ll     |  89 -------
 .../X86/uncountable-early-exit-vplan.ll       | 244 ++++++++++++++++++
 4 files changed, 283 insertions(+), 122 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4ccbc8e95d1edf..5188aeed2f32b5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8845,11 +8845,6 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
       continue;
     auto *ExitVPBB = cast<VPIRBasicBlock>(VPB);
     BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
-    BasicBlock *ExitingBB = find_singleton<BasicBlock>(
-        to_vector(predecessors(ExitBB)),
-        [OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
-          return OrigLoop->contains(Pred) ? Pred : nullptr;
-        });
     for (VPRecipeBase &R : *ExitVPBB) {
       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
       if (!ExitIRI)
@@ -8857,23 +8852,27 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
       if (!ExitPhi)
         break;
-      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
-      VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
-      // Exit values for inductions are computed and updated outside of VPlan
-      // and independent of induction recipes.
-      // TODO: Compute induction exit values in VPlan.
-      if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
-           !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
-          isa<VPWidenPointerInductionRecipe>(V) ||
-          (isa<Instruction>(IncomingValue) &&
-           OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
-           any_of(IncomingValue->users(), [&Inductions](User *U) {
-             auto *P = dyn_cast<PHINode>(U);
-             return P && Inductions.contains(P);
-           })))
-        continue;
-      ExitUsersToFix.insert(ExitIRI);
-      ExitIRI->addOperand(V);
+      for (BasicBlock *ExitingBB : predecessors(ExitBB)) {
+        if (!OrigLoop->contains(ExitingBB))
+          continue;
+        Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+        VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
+        // Exit values for inductions are computed and updated outside of VPlan
+        // and independent of induction recipes.
+        // TODO: Compute induction exit values in VPlan.
+        if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+             !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+            isa<VPWidenPointerInductionRecipe>(V) ||
+            (isa<Instruction>(IncomingValue) &&
+             OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
+             any_of(IncomingValue->users(), [&Inductions](User *U) {
+               auto *P = dyn_cast<PHINode>(U);
+               return P && Inductions.contains(P);
+             })))
+          continue;
+        ExitUsersToFix.insert(ExitIRI);
+        ExitIRI->addOperand(V);
+      }
     }
   }
   return ExitUsersToFix;
@@ -8887,23 +8886,26 @@ addUsersInExitBlocks(VPlan &Plan,
   if (ExitUsersToFix.empty())
     return;
 
-  auto *MiddleVPBB = Plan.getMiddleBlock();
-  VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
-
   // Introduce extract for exiting values and update the VPIRInstructions
   // modeling the corresponding LCSSA phis.
   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
+
     VPValue *V = ExitIRI->getOperand(0);
     // Pass live-in values used by exit phis directly through to their users in
     // the exit block.
     if (V->isLiveIn())
       continue;
 
-    LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
-    VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
-                                  {V, Plan.getOrAddLiveIn(ConstantInt::get(
-                                          IntegerType::get(Ctx, 32), 1))});
-    ExitIRI->setOperand(0, Ext);
+    for (VPBlockBase *PredVPB : ExitIRI->getParent()->getPredecessors()) {
+      auto *PredVPBB = cast<VPBasicBlock>(PredVPB);
+      VPBuilder B(PredVPBB, PredVPBB->getFirstNonPhi());
+
+      LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
+      VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
+                                    {V, Plan.getOrAddLiveIn(ConstantInt::get(
+                                            IntegerType::get(Ctx, 32), 1))});
+      ExitIRI->setOperand(0, Ext);
+    }
   }
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d336827b23ddf3..661b100d7881c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1815,9 +1815,13 @@ void VPlanTransforms::handleUncountableEarlyExit(
     auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
     BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
     BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
-    VPIRBasicBlock *VPExitBlock;
-    VPExitBlock = VPIRBasicBlock::fromBasicBlock(
-        !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+    VPBasicBlock *VPExitBlock;
+    if (OrigLoop->getUniqueExitBlock()) {
+      VPExitBlock = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+    } else {
+      VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+          !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+    }
 
     VPValue *M = RecipeBuilder.getBlockInMask(
         OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
deleted file mode 100644
index 5da97ef8f9b3bc..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll
+++ /dev/null
@@ -1,89 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
-
-declare void @init(ptr)
-
-define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-entry:
-  %src = alloca [128 x i32]
-  call void @init(ptr %src)
-  br label %loop.header
-
-loop.header:
-  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
-  %l = load i32, ptr %gep.src
-  %c.1 = icmp eq i32 %l, 10
-  br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
-  %inc = add nuw i64 %iv, 1
-  %c.2 = icmp eq i64 %inc, 128
-  br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
-
-e1:
-  %p1 = phi i64 [ 0, %loop.header ]
-  ret i64 %p1
-
-e2:
-  %p2 = phi i64 [ 1, %loop.latch ]
-  ret i64 %p2
-}
-
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
new file mode 100644
index 00000000000000..13f5671f893651
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
+
+declare void @init(ptr)
+
+define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  %src = alloca [128 x i32]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %c.1 = icmp eq i32 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  %p1 = phi i64 [ 0, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+
+define i64 @multi_exiting_to_different_exits_load_exit_value() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  %src = alloca [128 x i64]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l = load i64, ptr %gep.src
+  %c.1 = icmp eq i64 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  %p1 = phi i64 [ %l, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_load_exit_value() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
+; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
+; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+entry:
+  %src = alloca [128 x i64]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l = load i64, ptr %gep.src
+  %l.2 = load i64, ptr %gep.src
+  %c.1 = icmp eq i64 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e1, label %loop.header
+
+e1:
+  %p1 = phi i64 [ %l, %loop.header ], [ %l.2, %loop.latch ]
+  ret i64 %p1
+}

>From e849195a4994293aab10fb9786dc006064374ac0 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 5 Nov 2024 20:48:45 +0000
Subject: [PATCH 08/15] !fixup address more comments.

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  27 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   4 +
 llvm/lib/Transforms/Vectorize/VPlanCFG.h      |   9 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   1 -
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  13 +-
 .../Transforms/Vectorize/VPlanTransforms.h    |  13 +-
 .../LoopVectorize/X86/multi-exit-codegen.ll   | 240 -----------------
 .../X86/uncountable-early-exit-vplan.ll       | 244 ------------------
 .../uncountable-early-exit-vplan.ll           | 171 ++++++++++++
 9 files changed, 218 insertions(+), 504 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
 delete mode 100644 llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5188aeed2f32b5..1c02db88c3f3c8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8839,11 +8839,7 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
   SetVector<VPIRInstruction *> ExitUsersToFix;
-  for (VPBlockBase *VPB : vp_depth_first_shallow(
-           Plan.getVectorLoopRegion()->getSingleSuccessor())) {
-    if (VPB->getNumSuccessors() != 0 || VPB == Plan.getScalarHeader())
-      continue;
-    auto *ExitVPBB = cast<VPIRBasicBlock>(VPB);
+  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
     BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
     for (VPRecipeBase &R : *ExitVPBB) {
       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
@@ -9178,14 +9174,31 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   RecipeBuilder.fixHeaderPhis();
 
   if (Legal->hasUncountableEarlyExit()) {
-    VPlanTransforms::handleUncountableEarlyExit(*Plan, *PSE.getSE(), OrigLoop,
-                                                RecipeBuilder);
+    VPlanTransforms::handleUncountableEarlyExit(
+        *Plan, *PSE.getSE(), OrigLoop, Legal->getUncountableExitingBlocks(),
+        RecipeBuilder);
   }
   addScalarResumePhis(RecipeBuilder, *Plan);
   SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
       OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
   addUsersInExitBlocks(*Plan, ExitUsersToFix);
+
+  // With uncountable early exits, currently only live-in exit values are
+  // supported. Also bail out if any exit value isn't handled in VPlan yet,
+  // i.e. a VPIRInstruction in an exit block without any operands.
+  if (Legal->hasUncountableEarlyExit()) {
+    if (any_of(Plan->getExitBlocks(), [](VPIRBasicBlock *ExitBB) {
+          return any_of(*ExitBB, [](VPRecipeBase &R) {
+            auto VPIRI = cast<VPIRInstruction>(&R);
+            return VPIRI->getNumOperands() == 0 ||
+                   any_of(VPIRI->operands(),
+                          [](VPValue *Op) { return !Op->isLiveIn(); });
+          });
+        }))
+      return nullptr;
+  }
+
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
   // bring the VPlan to its final state.
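
For readers skimming the hunk above: the new bail-out drops a candidate plan whenever an exit phi is either not modeled in VPlan yet or uses a value defined inside the loop. A standalone sketch of that predicate, using toy types rather than the real VPlan classes (names below are illustrative only):

#include <vector>

// Toy model of the bail-out above: with an uncountable early exit, every exit
// phi must already be modeled in VPlan and may only use live-in (loop
// invariant) values; otherwise the candidate plan is dropped.
struct ToyExitPhi {
  bool ModeledInVPlan;                // has at least one VPlan operand
  std::vector<bool> OperandIsLiveIn;  // one flag per operand
};

static bool planSupportsEarlyExit(const std::vector<ToyExitPhi> &ExitPhis) {
  for (const ToyExitPhi &Phi : ExitPhis) {
    if (!Phi.ModeledInVPlan)
      return false;
    for (bool IsLiveIn : Phi.OperandIsLiveIn)
      if (!IsLiveIn)
        return false;
  }
  return true;
}
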
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 44fd49fed6ad90..34a3a180bfa52a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3832,6 +3832,10 @@ class VPlan {
     return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
   }
 
+  /// Return the exit blocks of the VPlan, that is, leaf blocks other than the
+  /// scalar header.
+  auto getExitBlocks();
+
   /// The trip count of the original loop.
   VPValue *getTripCount() const {
     assert(TripCount && "trip count needs to be set before accessing it");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index 89e2e7514dac2b..6ca388a953a6ff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -306,6 +306,15 @@ template <> struct GraphTraits<VPlan *> {
   }
 };
 
+inline auto VPlan::getExitBlocks() {
+  VPBlockBase *ScalarHeader = getScalarHeader();
+  return make_filter_range(
+      VPBlockUtils::blocksOnly<VPIRBasicBlock>(
+          vp_depth_first_shallow(getVectorLoopRegion()->getSingleSuccessor())),
+      [ScalarHeader](VPIRBasicBlock *VPIRBB) {
+        return VPIRBB != ScalarHeader && VPIRBB->getNumSuccessors() == 0;
+      });
+}
 } // namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLANCFG_H
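
A note on placement: getExitBlocks() is declared in VPlan.h but defined here, presumably because it relies on vp_depth_first_shallow and VPBlockUtils::blocksOnly, which need the traversal machinery from this header. The same filtering idea, written against a toy block type to make the invariant explicit (exit blocks are reachable leaves other than the scalar header; this is a sketch, not the real VPlan API):

#include <vector>

// Toy model of the filter in VPlan::getExitBlocks() above: an exit block is a
// reachable leaf (no successors) that is not the scalar header.
struct ToyBlock {
  std::vector<ToyBlock *> Successors;
};

static std::vector<ToyBlock *>
collectExitBlocks(const std::vector<ToyBlock *> &ReachableFromMiddle,
                  const ToyBlock *ScalarHeader) {
  std::vector<ToyBlock *> Exits;
  for (ToyBlock *Block : ReachableFromMiddle)
    if (Block != ScalarHeader && Block->Successors.empty())
      Exits.push_back(Block);
  return Exits;
}
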
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7f582ce8c99433..a1a0c2ffcf0597 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -872,7 +872,6 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
   O << Indent << "IR " << I;
 
   if (getNumOperands() != 0) {
-    // assert(getNumOperands() == 1 && "can have at most 1 operand");
     O << " (extra operand: ";
     printOperands(O, SlotTracker);
     O << ")";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 661b100d7881c2..ed37e492b34fb1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1793,31 +1793,26 @@ void VPlanTransforms::createInterleaveGroups(
 
 void VPlanTransforms::handleUncountableEarlyExit(
     VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
+    ArrayRef<BasicBlock *> UncountableExitingBlocks,
     VPRecipeBuilder &RecipeBuilder) {
   auto *LatchVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
   VPBuilder Builder(LatchVPBB->getTerminator());
   auto *MiddleVPBB = Plan.getMiddleBlock();
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
-
   VPValue *EarlyExitTaken = nullptr;
-  SmallVector<BasicBlock *> ExitingBBs;
-  OrigLoop->getExitingBlocks(ExitingBBs);
 
   // Process all uncountable exiting blocks. For each exiting block, update the
   // EarlyExitTaken, which tracks if any uncountable early exit has been taken.
   // Also split the middle block and branch to the exit block for the early exit
   // if it has been taken.
-  for (BasicBlock *Exiting : ExitingBBs) {
-    if (Exiting == OrigLoop->getLoopLatch())
-      continue;
-
+  for (BasicBlock *Exiting : UncountableExitingBlocks) {
     auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
     BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
     BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
-    VPBasicBlock *VPExitBlock;
+    VPIRBasicBlock *VPExitBlock;
     if (OrigLoop->getUniqueExitBlock()) {
-      VPExitBlock = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+      VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
     } else {
       VPExitBlock = VPIRBasicBlock::fromBasicBlock(
           !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f7bbae25279fce..dc5dec2f1b84a6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -124,9 +124,16 @@ struct VPlanTransforms {
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
 
-  static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE,
-                                         Loop *OrigLoop,
-                                         VPRecipeBuilder &RecipeBuilder);
+  /// Update \p Plan to account for uncountable exit blocks in \p
+  /// UncountableExitingBlocks by
+  ///  * updating the condition to exit the vector loop to include the early
+  ///  exit conditions
+  ///  * splitting the original middle block to branch to the early exit blocks
+  ///  if taken. Returns false if the transformation wasn't successful.
+  static void
+  handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
+                             ArrayRef<BasicBlock *> UncountableExitingBlocks,
+                             VPRecipeBuilder &RecipeBuilder);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
deleted file mode 100644
index 0c33715c6bd271..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
+++ /dev/null
@@ -1,240 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization %s | FileCheck --check-prefix=MULTI %s
-; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s
-
-define i64 @multi_exit_with_store(ptr %p, i64 %N) {
-; MULTI-LABEL: define i64 @multi_exit_with_store(
-; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; MULTI-NEXT:  [[ENTRY:.*]]:
-; MULTI-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; MULTI:       [[VECTOR_PH]]:
-; MULTI-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; MULTI-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; MULTI-NEXT:    br label %[[VECTOR_BODY:.*]]
-; MULTI:       [[VECTOR_BODY]]:
-; MULTI-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; MULTI-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; MULTI-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; MULTI-NEXT:    [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; MULTI-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; MULTI-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
-; MULTI-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
-; MULTI-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
-; MULTI-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; MULTI-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
-; MULTI-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; MULTI-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; MULTI-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; MULTI-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
-; MULTI-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; MULTI:       [[MIDDLE_SPLIT]]:
-; MULTI-NEXT:    br i1 [[TMP6]], label %[[E1:.*]], label %[[MIDDLE_BLOCK:.*]]
-; MULTI:       [[MIDDLE_BLOCK]]:
-; MULTI-NEXT:    br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]]
-; MULTI:       [[SCALAR_PH]]:
-; MULTI-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; MULTI-NEXT:    br label %[[LOOP_HEADER:.*]]
-; MULTI:       [[LOOP_HEADER]]:
-; MULTI-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; MULTI-NEXT:    [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
-; MULTI-NEXT:    br i1 [[CMP1]], label %[[E1]], label %[[LOOP_LATCH]]
-; MULTI:       [[LOOP_LATCH]]:
-; MULTI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
-; MULTI-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
-; MULTI-NEXT:    [[INC]] = add nuw i64 [[I_07]], 1
-; MULTI-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
-; MULTI-NEXT:    br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
-; MULTI:       [[E1]]:
-; MULTI-NEXT:    ret i64 0
-; MULTI:       [[E2]]:
-; MULTI-NEXT:    ret i64 1
-;
-; DEFAULT-LABEL: define i64 @multi_exit_with_store(
-; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; DEFAULT-NEXT:  [[ENTRY:.*]]:
-; DEFAULT-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
-; DEFAULT-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4
-; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; DEFAULT:       [[VECTOR_PH]]:
-; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
-; DEFAULT-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; DEFAULT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]]
-; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]]
-; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
-; DEFAULT:       [[VECTOR_BODY]]:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
-; DEFAULT-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP3]], align 4
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DEFAULT:       [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT:    br label %[[SCALAR_PH]]
-; DEFAULT:       [[SCALAR_PH]]:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; DEFAULT-NEXT:    br label %[[LOOP_HEADER:.*]]
-; DEFAULT:       [[LOOP_HEADER]]:
-; DEFAULT-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; DEFAULT-NEXT:    [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]]
-; DEFAULT-NEXT:    br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]]
-; DEFAULT:       [[LOOP_LATCH]]:
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]]
-; DEFAULT-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
-; DEFAULT-NEXT:    [[INC]] = add nuw i64 [[I_07]], 1
-; DEFAULT-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128
-; DEFAULT-NEXT:    br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
-; DEFAULT:       [[E1]]:
-; DEFAULT-NEXT:    ret i64 0
-; DEFAULT:       [[E2]]:
-; DEFAULT-NEXT:    ret i64 1
-;
-entry:
-  br label %loop.header
-
-loop.header:
-  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-  %c.1 = icmp uge i64 %iv, %N
-  br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
-  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
-  store i32 0, ptr %arrayidx
-  %inc = add nuw i64 %iv, 1
-  %c.2 = icmp eq i64 %inc, 128
-  br i1 %c.2, label %e2, label %loop.header, !llvm.loop !1
-
-e1:
-  ret i64 0
-
-e2:
-  ret i64 1
-}
-
-define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) {
-; MULTI-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
-; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; MULTI-NEXT:  [[ENTRY:.*]]:
-; MULTI-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; MULTI:       [[VECTOR_PH]]:
-; MULTI-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; MULTI-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; MULTI-NEXT:    br label %[[VECTOR_BODY:.*]]
-; MULTI:       [[VECTOR_BODY]]:
-; MULTI-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; MULTI-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; MULTI-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; MULTI-NEXT:    [[TMP1:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; MULTI-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; MULTI-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]]
-; MULTI-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
-; MULTI-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP4]], i32 4, <4 x i1> [[TMP2]])
-; MULTI-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; MULTI-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
-; MULTI-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; MULTI-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; MULTI-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; MULTI-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
-; MULTI-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; MULTI:       [[MIDDLE_SPLIT]]:
-; MULTI-NEXT:    br i1 [[TMP6]], label %[[E:.*]], label %[[MIDDLE_BLOCK:.*]]
-; MULTI:       [[MIDDLE_BLOCK]]:
-; MULTI-NEXT:    br i1 true, label %[[E]], label %[[SCALAR_PH]]
-; MULTI:       [[SCALAR_PH]]:
-; MULTI-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; MULTI-NEXT:    br label %[[LOOP_HEADER:.*]]
-; MULTI:       [[LOOP_HEADER]]:
-; MULTI-NEXT:    [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; MULTI-NEXT:    [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
-; MULTI-NEXT:    br i1 [[C_1]], label %[[E]], label %[[LOOP_LATCH]]
-; MULTI:       [[LOOP_LATCH]]:
-; MULTI-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
-; MULTI-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
-; MULTI-NEXT:    [[INC]] = add nuw i64 [[IV]], 1
-; MULTI-NEXT:    [[C_2:%.*]] = icmp eq i64 [[INC]], 128
-; MULTI-NEXT:    br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
-; MULTI:       [[E]]:
-; MULTI-NEXT:    [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ 1, %[[MIDDLE_SPLIT]] ]
-; MULTI-NEXT:    ret i64 [[P1]]
-;
-; DEFAULT-LABEL: define i64 @multi_exiting_to_same_exit_with_store(
-; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; DEFAULT-NEXT:  [[ENTRY:.*]]:
-; DEFAULT-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127)
-; DEFAULT-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
-; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; DEFAULT:       [[VECTOR_PH]]:
-; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; DEFAULT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
-; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
-; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
-; DEFAULT:       [[VECTOR_BODY]]:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; DEFAULT-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; DEFAULT-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DEFAULT-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DEFAULT:       [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT:    br label %[[SCALAR_PH]]
-; DEFAULT:       [[SCALAR_PH]]:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; DEFAULT-NEXT:    br label %[[LOOP_HEADER:.*]]
-; DEFAULT:       [[LOOP_HEADER]]:
-; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; DEFAULT-NEXT:    [[C_1:%.*]] = icmp uge i64 [[IV]], [[N]]
-; DEFAULT-NEXT:    br i1 [[C_1]], label %[[E:.*]], label %[[LOOP_LATCH]]
-; DEFAULT:       [[LOOP_LATCH]]:
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
-; DEFAULT-NEXT:    store i32 0, ptr [[ARRAYIDX]], align 4
-; DEFAULT-NEXT:    [[INC]] = add nuw i64 [[IV]], 1
-; DEFAULT-NEXT:    [[C_2:%.*]] = icmp eq i64 [[INC]], 128
-; DEFAULT-NEXT:    br i1 [[C_2]], label %[[E]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
-; DEFAULT:       [[E]]:
-; DEFAULT-NEXT:    [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ]
-; DEFAULT-NEXT:    ret i64 [[P1]]
-;
-entry:
-  br label %loop.header
-
-loop.header:
-  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-  %c.1 = icmp uge i64 %iv, %N
-  br i1 %c.1, label %e, label %loop.latch
-
-loop.latch:
-  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
-  store i32 0, ptr %arrayidx
-  %inc = add nuw i64 %iv, 1
-  %c.2 = icmp eq i64 %inc, 128
-  br i1 %c.2, label %e, label %loop.header, !llvm.loop !1
-
-e:
-  %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
-  ret i64 %p1
-}
-
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-;.
-; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; MULTI: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; MULTI: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; MULTI: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; MULTI: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; MULTI: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
-; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
deleted file mode 100644
index 13f5671f893651..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/X86/uncountable-early-exit-vplan.ll
+++ /dev/null
@@ -1,244 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
-
-declare void @init(ptr)
-
-define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-entry:
-  %src = alloca [128 x i32]
-  call void @init(ptr %src)
-  br label %loop.header
-
-loop.header:
-  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
-  %l = load i32, ptr %gep.src
-  %c.1 = icmp eq i32 %l, 10
-  br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
-  %inc = add nuw i64 %iv, 1
-  %c.2 = icmp eq i64 %inc, 128
-  br i1 %c.2, label %e2, label %loop.header
-
-e1:
-  %p1 = phi i64 [ 0, %loop.header ]
-  ret i64 %p1
-
-e2:
-  %p2 = phi i64 [ 1, %loop.latch ]
-  ret i64 %p2
-}
-
-define i64 @multi_exiting_to_different_exits_load_exit_value() {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-entry:
-  %src = alloca [128 x i64]
-  call void @init(ptr %src)
-  br label %loop.header
-
-loop.header:
-  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
-  %l = load i64, ptr %gep.src
-  %c.1 = icmp eq i64 %l, 10
-  br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
-  %inc = add nuw i64 %iv, 1
-  %c.2 = icmp eq i64 %inc, 128
-  br i1 %c.2, label %e2, label %loop.header
-
-e1:
-  %p1 = phi i64 [ %l, %loop.header ]
-  ret i64 %p1
-
-e2:
-  %p2 = phi i64 [ 1, %loop.latch ]
-  ret i64 %p2
-}
-
-define i64 @multi_exiting_to_same_exit_load_exit_value() {
-; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
-; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<128> = original trip-count
-; CHECK-EMPTY:
-; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: Successor(s): vector loop
-; CHECK-EMPTY:
-; CHECK-NEXT: <x1> vector loop: {
-; CHECK-NEXT:   vector.body:
-; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-NEXT:     WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<[[VF]]>
-; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N>
-; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
-; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<[[STEPS]]>
-; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%arrayidx>
-; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<0>, vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
-; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
-; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
-; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
-; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
-; CHECK-NEXT:   No successors
-; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): middle.split
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.split:
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
-; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
-; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
-; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph:
-; CHECK-NEXT: ir-bb<loop.header>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.header>:
-; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-; CHECK-NEXT:   IR   %c.1 = icmp uge i64 %iv, %N
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
-entry:
-  %src = alloca [128 x i64]
-  call void @init(ptr %src)
-  br label %loop.header
-
-loop.header:
-  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
-  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
-  %l = load i64, ptr %gep.src
-  %l.2 = load i64, ptr %gep.src
-  %c.1 = icmp eq i64 %l, 10
-  br i1 %c.1, label %e1, label %loop.latch
-
-loop.latch:
-  %inc = add nuw i64 %iv, 1
-  %c.2 = icmp eq i64 %inc, 128
-  br i1 %c.2, label %e1, label %loop.header
-
-e1:
-  %p1 = phi i64 [ %l, %loop.header ], [ %l.2, %loop.latch ]
-  ret i64 %p1
-}
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
new file mode 100644
index 00000000000000..d840646a259529
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S -enable-early-exit-vectorization -debug %s 2>&1 | FileCheck %s
+
+declare void @init(ptr)
+
+define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT:   IR %src = alloca [128 x i32], align 4
+; CHECK-NEXT:   IR call void @init(ptr %src)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<%3>
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src>
+; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10>
+; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e1>:
+; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<e2>:
+; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK:      No successors
+; CHECK-NEXT: }
+entry:
+  %src = alloca [128 x i32]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %c.1 = icmp eq i32 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  %p1 = phi i64 [ 0, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+
+define i64 @multi_exiting_to_different_exits_load_exit_value() {
+; CHECK-NOT: VPlan 'Final VPlan for VF={4},UF={1}' {
+entry:
+  %src = alloca [128 x i64]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l = load i64, ptr %gep.src
+  %c.1 = icmp eq i64 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  %p1 = phi i64 [ %l, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+
+define i64 @multi_exiting_to_same_exit_load_exit_value() {
+; CHECK-NOT: VPlan 'Final VPlan for VF={4},UF={1}' {
+
+entry:
+  %src = alloca [128 x i64]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l = load i64, ptr %gep.src
+  %l.2 = load i64, ptr %gep.src
+  %c.1 = icmp eq i64 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e1, label %loop.header
+
+e1:
+  %p1 = phi i64 [ %l, %loop.header ], [ %l.2, %loop.latch ]
+  ret i64 %p1
+}
+
+define i64 @multi_exiting_to_different_exits_induction_exit_value() {
+; CHECK-NOT: VPlan 'Final VPlan for VF={4},UF={1}' {
+entry:
+  %src = alloca [128 x i64]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l = load i64, ptr %gep.src
+  %c.1 = icmp eq i64 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  %p1 = phi i64 [ %iv, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+
+

>From c53eca6a59f55edc3dcb2bb692956de2fd97bfce Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 6 Nov 2024 20:40:41 +0000
Subject: [PATCH 09/15] !fixup remove left over canVectorizeEarlyExit

---
 .../Vectorize/LoopVectorizationLegality.h     |  3 ---
 .../Vectorize/LoopVectorizationLegality.cpp   | 19 -------------------
 2 files changed, 22 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index af6fae44cf0f09..dc7e484a40a452 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -287,9 +287,6 @@ class LoopVectorizationLegality {
   /// we can use in-order reductions.
   bool canVectorizeFPMath(bool EnableStrictReductions);
 
-  /// Returns true if the loop has an early exit that we can vectorize.
-  bool canVectorizeEarlyExit() const;
-
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking.
   bool canFoldTailByMasking() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ed3808d2f30bf1..0267fb1adb16d6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1515,25 +1515,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   return true;
 }
 
-bool LoopVectorizationLegality::canVectorizeEarlyExit() const {
-  // Currently only allow vectorizing loops with early exits, if early-exit
-  // vectorization is explicitly enabled and the loop has metadata to force
-  // vectorization.
-
-  SmallVector<BasicBlock *> Exiting;
-  TheLoop->getExitingBlocks(Exiting);
-  if (Exiting.size() == 1)
-    return false;
-
-  LoopVectorizeHints Hints(TheLoop, true, *ORE);
-  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
-    return false;
-
-  Function *Fn = TheLoop->getHeader()->getParent();
-  return Hints.allowVectorization(Fn, TheLoop,
-                                  true /*VectorizeOnlyWhenForced*/);
-}
-
 // Helper function to canVectorizeLoopNestCFG.
 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                     bool UseVPlanNativePath) {

>From e26af8e878c6484d1b7f95cb6f4c3c9966836fe2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 23 Nov 2024 12:56:49 +0000
Subject: [PATCH 10/15] !fixup address latest comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  94 +++++------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  15 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   2 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   4 -
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  61 +++----
 .../Transforms/Vectorize/VPlanTransforms.h    |  12 +-
 .../LoopVectorize/early_exit_legality.ll      |   2 +-
 .../LoopVectorize/single_early_exit.ll        | 157 ++++++++++++++++--
 .../uncountable-early-exit-vplan.ll           | 156 +++++++++++++++++
 .../LoopVectorize/unsupported_early_exit.ll   |   2 +-
 10 files changed, 384 insertions(+), 121 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b59922fd3fd7c2..5134c4ec7d225c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7783,7 +7783,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     LoopVectorizeHints Hints(L, true, *ORE);
     Hints.setAlreadyVectorized();
   }
-
   TargetTransformInfo::UnrollingPreferences UP;
   TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
   if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -8882,9 +8881,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
 static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
     Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
     const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+  auto *MiddleVPBB = Plan.getMiddleBlock();
   SetVector<VPIRInstruction *> ExitUsersToFix;
   for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
-    BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
     for (VPRecipeBase &R : *ExitVPBB) {
       auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
       if (!ExitIRI)
@@ -8892,26 +8891,34 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
       auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
       if (!ExitPhi)
         break;
-      for (BasicBlock *ExitingBB : predecessors(ExitBB)) {
-        if (!OrigLoop->contains(ExitingBB))
-          continue;
-      Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
-      VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
-      // Exit values for inductions are computed and updated outside of VPlan
-      // and independent of induction recipes.
-      // TODO: Compute induction exit values in VPlan.
-      if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
-           !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
-          isa<VPWidenPointerInductionRecipe>(V) ||
-          (isa<Instruction>(IncomingValue) &&
-           OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
-           any_of(IncomingValue->users(), [&Inductions](User *U) {
-             auto *P = dyn_cast<PHINode>(U);
-             return P && Inductions.contains(P);
-           })))
-        continue;
-      ExitUsersToFix.insert(ExitIRI);
-      ExitIRI->addOperand(V);
+      for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
+        BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
+        if (PredVPBB != MiddleVPBB) {
+          SmallVector<BasicBlock *> ExitingBlocks;
+          OrigLoop->getExitingBlocks(ExitingBlocks);
+          assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
+          ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
+                                                    : ExitingBlocks[0];
+        }
+        Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
+        VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
+        // Exit values for inductions are computed and updated outside of VPlan
+        // and independent of induction recipes.
+        // TODO: Compute induction exit values in VPlan.
+        if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
+             !cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
+            isa<VPWidenPointerInductionRecipe>(V) ||
+            (isa<Instruction>(IncomingValue) &&
+             OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
+             any_of(IncomingValue->users(), [&Inductions](User *U) {
+               auto *P = dyn_cast<PHINode>(U);
+               return P && Inductions.contains(P);
+             }))) {
+          if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
+            continue;
+        }
+        ExitUsersToFix.insert(ExitIRI);
+        ExitIRI->addOperand(V);
       }
     }
   }
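
The predecessor handling above is the subtle part: each VPlan predecessor of the exit block is mapped back to the IR exiting block whose incoming value the phi should use. The middle block stands for the latch exit; any other predecessor is the early-exit dispatch, and with the asserted limit of two exiting blocks the matching IR block is simply whichever one is not the latch. A condensed sketch of that mapping (toy pointers, not LLVM types):

// Toy model of the predecessor -> IR exiting block mapping above, assuming
// exactly two exiting blocks: the loop latch plus one uncountable early exit.
struct ToyBB {};

static ToyBB *exitingBlockForPredecessor(bool PredIsMiddleBlock, ToyBB *Latch,
                                         ToyBB *ExitingA, ToyBB *ExitingB) {
  if (PredIsMiddleBlock)
    return Latch;                                  // countable exit via latch
  return ExitingA == Latch ? ExitingB : ExitingA;  // the early exit
}
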
@@ -8919,24 +8926,28 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
 }
 
 // Add exit values to \p Plan. Extracts are added for each entry in \p
-// ExitUsersToFix if needed and their operands are updated.
-static void
+// ExitUsersToFix if needed and their operands are updated. Returns true if all
+// exit users can be handled, otherwise returns false.
+static bool
 addUsersInExitBlocks(VPlan &Plan,
                      const SetVector<VPIRInstruction *> &ExitUsersToFix) {
   if (ExitUsersToFix.empty())
-    return;
+    return true;
+
+  auto *MiddleVPBB = Plan.getMiddleBlock();
+  VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
 
   // Introduce extract for exiting values and update the VPIRInstructions
   // modeling the corresponding LCSSA phis.
   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
-
     VPValue *V = ExitIRI->getOperand(0);
     // Pass live-in values used by exit phis directly through to their users in
     // the exit block.
     if (V->isLiveIn())
       continue;
 
-    // Currently only live-ins can be used by exit values from blocks not exiting via the vector latch through to the middle block.
+    // Currently only live-ins can be used by exit values from blocks not
+    // exiting via the vector latch through to the middle block.
     if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
       return false;
 
@@ -9218,31 +9229,19 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   RecipeBuilder.fixHeaderPhis();
 
   if (Legal->hasUncountableEarlyExit()) {
+    assert(Legal->getUncountableExitingBlocks().size() == 1 &&
+           "Only single uncountable exiting block supported");
     VPlanTransforms::handleUncountableEarlyExit(
-        *Plan, *PSE.getSE(), OrigLoop, Legal->getUncountableExitingBlocks(),
+        *Plan, *PSE.getSE(), OrigLoop, Legal->getUncountableExitingBlocks()[0],
         RecipeBuilder);
   }
   addScalarResumePhis(RecipeBuilder, *Plan);
   SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
       OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
-  if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) 
-<<<<<<< HEAD
-
-  if (Legal->hasUncountableEarlyExit()) {
-    if (any_of(Plan->getExitBlocks(), [](VPIRBasicBlock *ExitBB) {
-          return any_of(*ExitBB, [](VPRecipeBase &R) {
-            auto VPIRI = cast<VPIRInstruction>(&R);
-            return VPIRI->getNumOperands() == 0 ||
-                   any_of(VPIRI->operands(),
-                          [](VPValue *Op) { return !Op->isLiveIn(); });
-          });
-        }))
-      return nullptr;
-  }
+  if (!addUsersInExitBlocks(*Plan, ExitUsersToFix))
+    return nullptr;
 
-=======
->>>>>>> origin/main
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
   // bring the VPlan to its final state.
@@ -9998,11 +9997,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (LVL.hasUncountableEarlyExit()) {
     if (!EnableEarlyExitVectorization) {
       reportVectorizationFailure("Auto-vectorization of loops with uncountable "
-                                 "early exit is not yet supported",
+                                 "early exit is not enabled",
                                  "Auto-vectorization of loops with uncountable "
-                                 "early exit is not yet supported",
-                                 "UncountableEarlyExitLoopsUnsupported", ORE,
-                                 L);
+                                 "early exit is not enabled",
+                                 "UncountableEarlyExitLoopsDisabled", ORE, L);
       return false;
     }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b99196961917eb..db8f0bfd60b4bc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -872,8 +872,10 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
 
   // Create SCEV and VPValue for the trip count.
   // We use the symbolic max backedge-taken-count, which is used when
-  // vectorizing loops with uncountable early exits
+  // vectorizing loops with uncountable early exits.
   const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
+  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
+         "Invalid loop count");
   ScalarEvolution &SE = *PSE.getSE();
   const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
                                                        InductionTy, TheLoop);
@@ -907,16 +909,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
   // 2) If we require a scalar epilogue, there is no conditional branch as
   //    we unconditionally branch to the scalar preheader.  Do nothing.
   // 3) Otherwise, construct a runtime check.
-  BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
-  if (!IRExitBlock) {
-    // If there's no unique exit block (i.e. vectorizing with an uncountable
-    // early exit), use the block exiting from the latch. The other uncountable
-    // exit blocks will be added later.
-    auto *Term = cast<BranchInst>(TheLoop->getLoopLatch()->getTerminator());
-    IRExitBlock = TheLoop->contains(Term->getSuccessor(0))
-                      ? Term->getSuccessor(1)
-                      : Term->getSuccessor(0);
-  }
+  BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
   auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
   // The connection order corresponds to the operands of the conditional
   // branch.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9d159b9adfba0a..00c69dd53a3635 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1232,6 +1232,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
+    // Returns a scalar boolean value, which is true if any lane of its single
+    // operand is true.
     AnyOf,
   };
 
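
The new comment pins down the semantics of AnyOf: as a scalar model it is just an OR-reduction over the lanes of its single mask operand, which in the codegen tests earlier in this thread materializes as llvm.vector.reduce.or over the negated block-in mask. A minimal sketch of that semantics, not the VPlan lowering itself:

#include <algorithm>
#include <vector>

// Scalar model of the AnyOf opcode: true iff any lane of the single mask
// operand is true.
static bool anyOfLanes(const std::vector<bool> &MaskLanes) {
  return std::any_of(MaskLanes.begin(), MaskLanes.end(),
                     [](bool Lane) { return Lane; });
}
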
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index cc816b0cba22fb..5527ed3c1fad08 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -68,8 +68,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
     default:
       return true;
     }
-  case VPExpandSCEVSC:
-    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -165,8 +163,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPScalarCastSC:
   case VPReverseVectorPointerSC:
     return false;
-  case VPExpandSCEVSC:
-    return getParent()->getPlan()->getTripCount() == getVPSingleValue();
   case VPInstructionSC:
     return mayWriteToMemory();
   case VPWidenCallSC: {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index fce63c340230e3..80089fa4dfa334 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -509,12 +509,6 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       Plan.getEntry());
 
-  for (VPRecipeBase &R : make_early_inc_range(
-           reverse(*cast<VPBasicBlock>(Plan.getPreheader())))) {
-    if (isDeadRecipe(R))
-      R.eraseFromParent();
-  }
-
   for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
     // The recipes in the block are processed in reverse order, to catch chains
     // of dead recipes.
@@ -1809,8 +1803,7 @@ void VPlanTransforms::createInterleaveGroups(
 
 void VPlanTransforms::handleUncountableEarlyExit(
     VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
-    ArrayRef<BasicBlock *> UncountableExitingBlocks,
-    VPRecipeBuilder &RecipeBuilder) {
+    BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) {
   auto *LatchVPBB =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
   VPBuilder Builder(LatchVPBB->getTerminator());
@@ -1818,36 +1811,34 @@ void VPlanTransforms::handleUncountableEarlyExit(
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPValue *EarlyExitTaken = nullptr;
 
-  // Process all uncountable exiting blocks. For each exiting block, update the
-  // EarlyExitTaken, which tracks if any uncountable early exit has been taken.
-  // Also split the middle block and branch to the exit block for the early exit
-  // if it has been taken.
-  for (BasicBlock *Exiting : UncountableExitingBlocks) {
-    auto *ExitingTerm = cast<BranchInst>(Exiting->getTerminator());
-    BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
-    BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
-    VPIRBasicBlock *VPExitBlock;
-    if (OrigLoop->getUniqueExitBlock()) {
-      VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
-    } else {
-      VPExitBlock = VPIRBasicBlock::fromBasicBlock(
-          !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
-    }
+  // Process the uncountable exiting block. Update EarlyExitTaken, which tracks
+  // if any uncountable early exit has been taken. Also split the middle block
+  // and branch to the exit block for the early exit if it has been taken.
+  auto *ExitingTerm =
+      cast<BranchInst>(UncountableExitingBlock->getTerminator());
+  BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0);
+  BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1);
+  VPIRBasicBlock *VPExitBlock;
+  if (OrigLoop->getUniqueExitBlock()) {
+    VPExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
+  } else {
+    VPExitBlock = VPIRBasicBlock::fromBasicBlock(
+        !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+  }
 
-    VPValue *M = RecipeBuilder.getBlockInMask(
-        OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
-    auto *N = Builder.createNot(M);
-    EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N});
+  VPValue *M = RecipeBuilder.getBlockInMask(
+      OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+  auto *N = Builder.createNot(M);
+  EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N});
 
-    VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
-    VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB);
-    VPBlockUtils::insertBlockAfter(NewMiddle, LoopRegion);
-    VPBlockUtils::connectBlocks(NewMiddle, VPExitBlock);
-    VPBlockUtils::connectBlocks(NewMiddle, MiddleVPBB);
+  VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split");
+  VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB);
+  VPBlockUtils::insertBlockAfter(NewMiddle, LoopRegion);
+  VPBlockUtils::connectBlocks(NewMiddle, VPExitBlock);
+  VPBlockUtils::connectBlocks(NewMiddle, MiddleVPBB);
 
-    VPBuilder MiddleBuilder(NewMiddle);
-    MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
-  }
+  VPBuilder MiddleBuilder(NewMiddle);
+  MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {EarlyExitTaken});
 
   // Replace the condition controlling the exit from the vector loop with one
   // exiting if either the original condition of the vector latch is true or any
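
Taken together with the VPlan dumps in the new test, the rewiring above yields a two-step dispatch after the vector loop: middle.split branches to the early-exit block when any lane took the uncountable exit, and otherwise falls through to the original middle.block, which keeps its branch to the countable exit or the scalar preheader. A toy model of that dispatch order (illustrative only, not the generated code):

// Toy model of the post-vector-loop dispatch: the early exit is tested first
// (middle.split), the countable exit second (middle.block).
enum class Dest { EarlyExit, CountableExit, ScalarRemainder };

static Dest dispatchAfterVectorLoop(bool EarlyExitTaken, bool TripCountReached) {
  if (EarlyExitTaken)
    return Dest::EarlyExit;       // middle.split -> early exit block
  if (TripCountReached)
    return Dest::CountableExit;   // middle.block -> latch exit block
  return Dest::ScalarRemainder;   // middle.block -> scalar.ph
}
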
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index dc5dec2f1b84a6..98f30c13c7e9b4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -127,13 +127,13 @@ struct VPlanTransforms {
   /// Update \p Plan to account for uncountable exit blocks in \p
   /// UncountableExitingBlocks by
   ///  * updating the condition to exit the vector loop to include the early
-  ///  exit conditions
+  ///    exit conditions
   ///  * splitting the original middle block to branch to the early exit blocks
-  ///  if taken. Returns false if the transformation wasn't successful.
-  static void
-  handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop,
-                             ArrayRef<BasicBlock *> UncountableExitingBlocks,
-                             VPRecipeBuilder &RecipeBuilder);
+  ///    if taken.
+  static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE,
+                                         Loop *OrigLoop,
+                                         BasicBlock *UncountableExitingBlock,
+                                         VPRecipeBuilder &RecipeBuilder);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 21433477c1d7a3..f7fd55a0f039a3 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; REQUIRES: asserts
-; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 -disable-output 2>&1 | FileCheck %s
 
 declare void @init_mem(ptr, i64);
 
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 52f82d007de4df..08a333fa865154 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 | FileCheck %s
 
 declare void @init_mem(ptr, i64);
 
@@ -11,21 +11,47 @@ define i64 @same_exit_block_phi_of_consts() {
 ; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
 ; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP1:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       loop.end:
-; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ]
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP1]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[MIDDLE_SPLIT]] ]
 ; CHECK-NEXT:    ret i64 [[RETVAL]]
 ;
 entry:
@@ -62,19 +88,45 @@ define i64 @diff_exit_block_phi_of_consts() {
 ; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
 ; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true)
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOOP_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP1:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       loop.early.exit:
 ; CHECK-NEXT:    ret i64 0
 ; CHECK:       loop.end:
@@ -119,22 +171,66 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
 ; CHECK-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
 ; CHECK-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
 ; CHECK-NEXT:    [[END_CLAMPED:%.*]] = and i32 [[END]], 1023
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[END]] to i10
+; CHECK-NEXT:    [[TMP20:%.*]] = zext i10 [[TMP19]] to i64
+; CHECK-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP20]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i32 [[UMAX]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[TMP4:%.*]] = add i8 1, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i8 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP2]], 255
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i8
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[TMP14]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP15]])
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br i1 [[TMP16]], label [[FOUND:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]]
+; CHECK-NEXT:    br i1 [[CMP_EARLY]], label [[FOUND]], label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[IND_NEXT]] = add i8 [[IND]], 1
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32
 ; CHECK-NEXT:    [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY1]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       found:
 ; CHECK-NEXT:    ret i32 1
 ; CHECK:       exit:
@@ -183,14 +279,33 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SVAL:%.*]] = load i32, ptr [[S]], align 4
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[SVAL]], 0
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 276
+; CHECK-NEXT:    [[TMP3:%.*]] = or i1 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_SPLIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br i1 [[TMP1]], label [[EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 266, [[MIDDLE_BLOCK]] ], [ -10, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[IND:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT:%.*]]
+; CHECK-NEXT:    [[IND:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[IND_NEXT]] = add nsw i32 [[IND]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IND_NEXT]], 266
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       early.exit:
 ; CHECK-NEXT:    tail call void @abort()
 ; CHECK-NEXT:    unreachable
@@ -218,3 +333,15 @@ early.exit:
 for.end:
   ret i32 0
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
index d840646a259529..9df40d77ecc5fd 100644
--- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -114,6 +114,162 @@ e2:
   ret i64 %p2
 }
 
+define i64 @multi_exiting_to_same_exit_live_in_exit_values() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT:   IR %src = alloca [128 x i32], align 4
+; CHECK-NEXT:   IR call void @init(ptr %src)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<%3>
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src>
+; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10>
+; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT:   IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<1>, ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK:      No successors
+; CHECK-NEXT: }
+
+entry:
+  %src = alloca [128 x i32]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %c.1 = icmp eq i32 %l, 10
+  br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %exit, label %loop.header
+
+exit:
+  %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+  ret i64 %p
+}
+
+define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() {
+; CHECK-LABEL: VPlan 'Final VPlan for VF={4},UF={1}' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<128> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT:   IR %src = alloca [128 x i32], align 4
+; CHECK-NEXT:   IR call void @init(ptr %src)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
+; CHECK-NEXT:     CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<%3>
+; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep.src>
+; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:     WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10>
+; CHECK-NEXT:     EMIT vp<[[NOT1:%.+]]> = not ir<%c.1>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT vp<[[NOT2:%.+]]> = not vp<[[NOT1]]>
+; CHECK-NEXT:     EMIT vp<[[EA_TAKEN:%.+]]> = any-of vp<[[NOT2]]>
+; CHECK-NEXT:     EMIT vp<[[LATCH_CMP:%.+]]> = icmp eq vp<%index.next>, vp<[[VTC]]>
+; CHECK-NEXT:     EMIT vp<[[EC:%.+]]> = or vp<[[EA_TAKEN]]>, vp<[[LATCH_CMP]]>
+; CHECK-NEXT:     EMIT branch-on-cond vp<[[EC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.split
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.split:
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[EA_TAKEN]]>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT:   IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<1>, ir<0>)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]>
+; CHECK-NEXT:   EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: ir-bb<loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.header>:
+; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+; CHECK:      No successors
+; CHECK-NEXT: }
+
+entry:
+  %src = alloca [128 x i32]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %c.1 = icmp eq i32 %l, 10
+  br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %exit, label %loop.header
+
+exit:
+  %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ]
+  ret i64 %p
+
+; uselistorder directives
+  uselistorder label %exit, { 1, 0 }
+}
+
+
 define i64 @multi_exiting_to_same_exit_load_exit_value() {
 ; CHECK-NOT: VPlan 'Final VPlan for VF={4},UF={1}' {
 
diff --git a/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll
index cd91d07120f9ee..5b2a95f1b368c3 100644
--- a/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization  -force-vector-width=4 | FileCheck %s
 
 declare void @init_mem(ptr, i64);
 

>From 06c3d39f4f2ce65569ec71fef8d15ab36aa2f24b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 23 Nov 2024 15:40:10 +0000
Subject: [PATCH 11/15] [VPlan] Print incoming VPBB for phi VPIRInstruction
 (NFC).

---
 llvm/lib/Transforms/Vectorize/VPlan.h          |  2 +-
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp |  4 +++-
 .../RISCV/vplan-vp-intrinsics-reduction.ll     |  8 ++++----
 .../first-order-recurrence-chains-vplan.ll     | 18 +++++++++---------
 ...t-order-recurrence-sink-replicate-region.ll | 14 +++++++-------
 .../interleave-and-scalarize-only.ll           |  2 +-
 .../Transforms/LoopVectorize/vplan-printing.ll | 10 +++++-----
 7 files changed, 30 insertions(+), 28 deletions(-)
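
With this change, the extra operand of a phi VPIRInstruction is printed together with the VPlan block it is incoming from, which is what the test updates below now check, e.g. (one line quoted from vplan-printing.ll):

  IR %lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[EXIT]]> from middle.block)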

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 00c69dd53a3635..fc7e574a7d88cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -643,7 +643,7 @@ class VPBlockBase {
   virtual void dropAllReferences(VPValue *NewValue) = 0;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void printAsOperand(raw_ostream &OS, bool PrintType) const {
+  void printAsOperand(raw_ostream &OS, bool PrintType = false) const {
     OS << getName();
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5527ed3c1fad08..57649e0233f9d5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -869,7 +869,9 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
 
   if (getNumOperands() != 0) {
     O << " (extra operand: ";
-    printOperands(O, SlotTracker);
+    getOperand(0)->printAsOperand(O, SlotTracker);
+    O << " from ";
+    getParent()->getPredecessors()[0]->printAsOperand(O);
     O << ")";
   }
 }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 414f39d557044a..ab541f6fa94e66 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -60,7 +60,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: ir-bb<for.end>:
-; IF-EVL-OUTLOOP-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>)
+; IF-EVL-OUTLOOP-NEXT:   IR   %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]> from middle.block)
 ; IF-EVL-OUTLOOP-NEXT: No successors
 ; IF-EVL-OUTLOOP-EMPTY:
 ; IF-EVL-OUTLOOP-NEXT: scalar.ph:
@@ -110,7 +110,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: ir-bb<for.end>:
-; IF-EVL-INLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>)
+; IF-EVL-INLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]> from middle.block)
 ; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
@@ -156,7 +156,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: ir-bb<for.end>:
-; NO-VP-OUTLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>)
+; NO-VP-OUTLOOP-NEXT:  IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]> from middle.block)
 ; NO-VP-OUTLOOP-NEXT: No successors
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
@@ -202,7 +202,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: ir-bb<for.end>:
-; NO-VP-INLOOP-NEXT:   IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>)
+; NO-VP-INLOOP-NEXT:   IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]> from middle.block)
 ; NO-VP-INLOOP-NEXT: No successors
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index bcacfb358ec05a..517de8be5c9987 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -48,8 +48,8 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>)
-; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1)
+; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
+; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
 ; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT:  No successors
@@ -125,9 +125,9 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>)
-; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1)
-; CHECK-NEXT:    IR   %for.3 = phi i16 [ 33, %entry ], [ %for.2, %loop ] (extra operand: vp<[[RESUME_3_P]]>.2)
+; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
+; CHECK-NEXT:    IR   %for.2 = phi i16 [ 33, %entry ], [ %for.1, %loop ] (extra operand: vp<[[RESUME_2_P]]>.1 from scalar.ph)
+; CHECK-NEXT:    IR   %for.3 = phi i16 [ 33, %entry ], [ %for.2, %loop ] (extra operand: vp<[[RESUME_3_P]]>.2 from scalar.ph)
 ; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors
@@ -205,8 +205,8 @@ define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
-; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]>)
-; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1)
+; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
+; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
 ; CHECK-NEXT: }
 ;
@@ -279,8 +279,8 @@ define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
-; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]>)
-; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1)
+; CHECK-NEXT:   IR   %for.x = phi i64 [ %for.x.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_X]]> from scalar.ph)
+; CHECK-NEXT:   IR   %for.y = phi i32 [ %for.x.prev, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_Y]]>.1 from scalar.ph)
 ; CHECK:     No successors
 ; CHECK-NEXT: }
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 8ae538cf63986b..d0c811763a522a 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -85,7 +85,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]>)
+; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
@@ -172,7 +172,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>)
+; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
@@ -235,7 +235,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT:   IR %res = phi i32 [ %and.red.next, %loop ] (extra operand: vp<[[RED_EX]]>)
+; CHECK-NEXT:   IR %res = phi i32 [ %and.red.next, %loop ] (extra operand: vp<[[RED_EX]]> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -244,7 +244,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>)
+; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK-NEXT:   IR   %and.red = phi i32 [ 1234, %entry ], [ %and.red.next, %loop ]
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
@@ -355,7 +355,7 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]>)
+; CHECK-NEXT:   IR   %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK:        IR   %ec = icmp eq i32 %iv.next, 20001
 ; CHECK-NEXT: No successors
@@ -452,7 +452,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-NEXT: Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]>)
+; CHECK-NEXT:   IR   %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK-NEXT:   IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK:        IR   %C = icmp sgt i32 %iv.next, %recur.next
 ; CHECK-NEXT: No successors
@@ -539,7 +539,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<loop>:
 ; CHECK-NEXT:   IR   %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT:   IR   %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] (extra operand: vp<[[RESUME_1_P]]>)
+; CHECK-NEXT:   IR   %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph)
 ; CHECK:        IR   %ec = icmp ugt i64 %iv, 3
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index a71666d8c3167a..dd58dc81ccedde 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -227,7 +227,7 @@ exit:
 ; DBG-EMPTY:
 ; DBG-NEXT: ir-bb<loop>:
 ; DBG-NEXT:   IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-; DBG-NEXT:   IR   %for = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ] (extra operand: vp<[[RESUME_P]]>)
+; DBG-NEXT:   IR   %for = phi i32 [ 0, %entry ], [ %iv.trunc, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
 ; DBG:        IR   %ec = icmp slt i32 %iv.next.trunc, %n
 ; DBG-NEXT: No successors
 ; DBG-NEXT: }
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 6bb20a301e0ade..195f6a48640e54 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -171,7 +171,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.end>
-; CHECK-NEXT:  IR %red.next.lcssa = phi float [ %red.next, %for.body ] (extra operand: vp<[[RED_EX]]>)
+; CHECK-NEXT:  IR %red.next.lcssa = phi float [ %red.next, %for.body ] (extra operand: vp<[[RED_EX]]> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -476,7 +476,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<for.end>
-; CHECK-NEXT:   IR %muladd.lcssa = phi float [ %muladd, %for.body ] (extra operand: vp<[[RED_EX]]>)
+; CHECK-NEXT:   IR %muladd.lcssa = phi float [ %muladd, %for.body ] (extra operand: vp<[[RED_EX]]> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -716,7 +716,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT:   IR %lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[EXIT]]>)
+; CHECK-NEXT:   IR %lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[EXIT]]> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -1111,7 +1111,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT:   IR %for.1.lcssa = phi i16 [ %for.1, %loop ] (extra operand: vp<[[FOR_RESULT]]>)
+; CHECK-NEXT:   IR %for.1.lcssa = phi i16 [ %for.1, %loop ] (extra operand: vp<[[FOR_RESULT]]> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
@@ -1119,7 +1119,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) {
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_P]]>)
+; CHECK-NEXT:    IR   %for.1 = phi i16 [ 22, %entry ], [ %for.1.next, %loop ] (extra operand: vp<[[RESUME_P]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
 ; CHECK:         IR   %exitcond.not = icmp eq i64 %iv.next, 1000
 ; CHECK-NEXT: No successors

>From 552bd9161f07ace9083c3ab409219add05aaf2bc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 23 Nov 2024 18:59:09 +0000
Subject: [PATCH 12/15] !fixup update recipe printing

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp         | 10 ++++++----
 .../LoopVectorize/uncountable-early-exit-vplan.ll      |  8 ++++----
 2 files changed, 10 insertions(+), 8 deletions(-)
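
This fixup extends the printing to phis with more than one extra operand: each incoming value is paired with its source block and the pairs are comma-separated, as in the updated check below (quoted from uncountable-early-exit-vplan.ll):

  IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from middle.split)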

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 57649e0233f9d5..761f2949126e49 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -868,10 +868,12 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
   O << Indent << "IR " << I;
 
   if (getNumOperands() != 0) {
-    O << " (extra operand: ";
-    getOperand(0)->printAsOperand(O, SlotTracker);
-    O << " from ";
-    getParent()->getPredecessors()[0]->printAsOperand(O);
+    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
+    interleaveComma(enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
+                         Op.value()->printAsOperand(O, SlotTracker);
+                         O << " from ";
+                        getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
+                         });
     O << ")";
   }
 }
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
index 9df40d77ecc5fd..2583070fc9820a 100644
--- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -41,7 +41,7 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<e1>, middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<e1>:
-; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>)
+; CHECK-NEXT:   IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0> from middle.split)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
@@ -50,7 +50,7 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<e2>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<e2>:
-; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>)
+; CHECK-NEXT:  IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1> from middle.block)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph:
@@ -152,7 +152,7 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<1>, ir<0>)
+; CHECK-NEXT:   IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from middle.split)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
@@ -228,7 +228,7 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() {
 ; CHECK-NEXT: Successor(s): ir-bb<exit>, middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT:   IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<1>, ir<0>)
+; CHECK-NEXT:   IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from middle.split)
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:

>From 00dea4a96e5b9cf863d30f3f5f06aab6acbd21a7 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 23 Nov 2024 19:16:35 +0000
Subject: [PATCH 13/15] !fixup add dbg message

---
 .../Vectorize/LoopVectorizationLegality.cpp       | 15 +++++++++++----
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp   |  7 ++++++-
 .../LoopVectorize/early_exit_legality.ll          |  6 +++---
 3 files changed, 20 insertions(+), 8 deletions(-)
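
Two notes on the hunks below: the blockNeedsPredication change narrows predication to the latch only, relying on the single uncountable exiting block being a direct predecessor of the latch (asserted in the code); and the reportVectorizationFailure call passes the same text twice because, assuming the usual helper in LoopVectorize.cpp, the first string is the -debug-only=loop-vectorize output and the second becomes the missed-optimization remark emitted under the given tag. The resulting message is what the updated legality test expects:

  LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet.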

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0267fb1adb16d6..0dcf5ecec13b0f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1375,10 +1375,17 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
 }
 
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
-  // When vectorizing early exits, create predicates for all blocks, except the
-  // header.
-  if (hasUncountableEarlyExit() && BB != TheLoop->getHeader())
-    return true;
+  // When vectorizing early exits, create predicates for the latch block. The
+  // early exiting block must be a direct predecessor of the latch at the
+  // moment.
+  BasicBlock *Latch = TheLoop->getLoopLatch();
+  if (hasUncountableEarlyExit()) {
+    assert(
+        getUncountableExitingBlocks().size() == 1 &&
+        is_contained(predecessors(Latch), getUncountableExitingBlocks()[0]) &&
+        "Uncountable exiting block must be a direct predecessor of latch");
+    return BB == Latch;
+  }
   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5134c4ec7d225c..bbd560c75f5afc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9239,8 +9239,13 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
       OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
-  if (!addUsersInExitBlocks(*Plan, ExitUsersToFix))
+  if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
+    reportVectorizationFailure(
+        "Some exit values in loop with uncountable exit not supported yet",
+        "Some exit values in loop with uncountable exit not supported yet",
+        "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
     return nullptr;
+  }
 
   // ---------------------------------------------------------------------------
   // Transform initial VPlan: Apply previously taken decisions, in order, to
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index f7fd55a0f039a3..2a99693523d3cf 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -11,7 +11,7 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
 ; CHECK-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check'
 ; CHECK:       Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32)))<nsw>
 ; CHECK-NEXT:  LV: We can vectorize this loop!
-; CHECK-NEXT:  LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported.
+; CHECK-NOT:   LV: Not vectorizing:
 entry:
   %p1 = alloca [1024 x i32]
   %p2 = alloca [1024 x i32]
@@ -49,7 +49,7 @@ define i64 @same_exit_block_pre_inc_use1() {
 ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1'
 ; CHECK:       LV: Found an early exit loop with symbolic max backedge taken count: 63
 ; CHECK-NEXT:  LV: We can vectorize this loop!
-; CHECK-NEXT:  LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported.
+; CHECK-NOT:   LV: Not vectorizing
 entry:
   %p1 = alloca [1024 x i8]
   %p2 = alloca [1024 x i8]
@@ -141,7 +141,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
 ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit'
 ; CHECK:       LV: Found an early exit loop with symbolic max backedge taken count: 63
 ; CHECK-NEXT:  LV: We can vectorize this loop!
-; CHECK-NEXT:  LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported.
+; CHECK:       LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet.
 entry:
   %p1 = alloca [1024 x i8]
   call void @init_mem(ptr %p1, i64 1024)

>From 7b8866d8329d53e55239b7f824a2e2f0b186da1c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 24 Nov 2024 07:45:12 +0000
Subject: [PATCH 14/15] !fixup fix formatting

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 761f2949126e49..7d13f437e934c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -869,11 +869,12 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
 
   if (getNumOperands() != 0) {
     O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
-    interleaveComma(enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
-                         Op.value()->printAsOperand(O, SlotTracker);
-                         O << " from ";
-                        getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
-                         });
+    interleaveComma(
+        enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
+          Op.value()->printAsOperand(O, SlotTracker);
+          O << " from ";
+          getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
+        });
     O << ")";
   }
 }

>From e4310165519a22a36d5bb6fcdf4b335bbd34a0a6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 8 Dec 2024 20:51:51 +0000
Subject: [PATCH 15/15] !restore test checks.

---
 .../LoopVectorize/X86/multi-exit-cost.ll       | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index 3517b5c484e37d..4e768074019d31 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
 ; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[N]], -1
-; CHECK-NEXT:    [[TMP4:%.*]] = freeze i64 [[TMP3]]
-; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]])
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; CHECK-NEXT:    [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]])
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK:       [[VECTOR_SCEVCHECK]]:
-; CHECK-NEXT:    [[TMP32:%.*]] = add nsw i64 [[N]], -1
-; CHECK-NEXT:    [[TMP33:%.*]] = freeze i64 [[TMP32]]
-; CHECK-NEXT:    [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]])
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze i64 [[TMP3]]
+; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]])
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    br i1 [[TMP7]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:


