[llvm] [LV] Transform to handle exits in the scalar loop (PR #148626)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 18 04:16:08 PST 2025


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/148626

From f1c98e83412e659aa0ed1a73ae4ecab33e543d4a Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 1 Jul 2025 13:08:48 +0000
Subject: [PATCH] Transform code

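Instead of handling an uncountable early exit inside the vector loop via
masking, this teaches the vectorizer (behind the new, off-by-default
-handle-early-exits-in-scalar-tail flag) to check whether the *next*
vector iteration would take any exit and, if so, branch to the scalar
loop to finish the remaining iterations. This avoids masking
state-changing operations such as stores. A rough sketch of the
resulting control flow, loosely based on the simple_contains test below
(block and value names are illustrative rather than the exact generated
names):

    vector.ph:      ; cloned exit check for the first vector iteration
      %any.of = ... ; reduce-or of the widened exit condition
      br i1 %any.of, label %scalar.ph, label %vector.ph.split
    vector.body:    ; masked load reads the next iteration's data
      %exit = or i1 %any.of.next, %last.vec.iter
      br i1 %exit, label %middle.block, label %vector.body
    middle.block:   ; resume in the scalar loop unless fully done
      %full.tc = icmp eq i64 %iv.next, %trip.count
      br i1 %full.tc, label %exit.block, label %scalar.ph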
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  18 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   1 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |  25 ++
 .../Vectorize/VPlanConstruction.cpp           |   7 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 224 +++++++++++---
 .../Transforms/Vectorize/VPlanTransforms.h    |  10 +
 llvm/lib/Transforms/Vectorize/VPlanUtils.cpp  |  30 +-
 .../early-exit-handle-exits-in-scalar-loop.ll | 226 ++++++++++++++
 .../LoopVectorize/early_exit_legality.ll      |   1 +
 .../LoopVectorize/single_early_exit.ll        | 284 ++++++++++++++++++
 .../uncountable-early-exit-vplan.ll           | 195 ++++++++++++
 11 files changed, 977 insertions(+), 44 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/early-exit-handle-exits-in-scalar-loop.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 356d759b94799..49d53de7ab98d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -402,6 +402,10 @@ static cl::opt<bool> ConsiderRegPressure(
     "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
     cl::desc("Discard VFs if their register pressure is too high."));
 
+static cl::opt<bool> HandleEarlyExitsInScalarTail(
+    "handle-early-exits-in-scalar-tail", cl::init(false), cl::Hidden,
+    cl::desc("Use the scalar tail to deal with early exit logic"));
+
 // Likelihood of bypassing the vectorized loop because there are zero trips left
 // after prolog. See `emitIterationCountCheck`.
 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -507,8 +511,7 @@ class InnerLoopVectorizer {
       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
         VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
         Cost(CM), BFI(BFI), PSI(PSI), RTChecks(RTChecks), Plan(Plan),
-        VectorPHVPBB(cast<VPBasicBlock>(
-            Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
+        VectorPHVPBB(Plan.getVectorPreheader()) {}
 
   virtual ~InnerLoopVectorizer() = default;
 
@@ -8304,6 +8307,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   auto VPlan0 = VPlanTransforms::buildVPlan0(
       OrigLoop, *LI, Legal->getWidestInductionType(),
       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer);
+  VPlan0->setEarlyExitContinuesInScalarLoop(Legal->hasUncountableEarlyExit() &&
+                                            HandleEarlyExitsInScalarTail);
 
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
@@ -8318,6 +8323,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
       if (CM.foldTailWithEVL())
         VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
                                  *Plan, CM.getMaxSafeElements());
+
+      // See if we can convert an early-exit VPlan to bail out to the scalar
+      // loop when an exit would be taken in the next vector iteration. This
+      // avoids masking state-changing operations (like stores) in the
+      // vector body. If the conversion fails, discard the plan.
+      if (!Plan->hasScalarVFOnly() && HandleEarlyExitsInScalarTail &&
+          !VPlanTransforms::runPass(
+              VPlanTransforms::handleUncountableExitsInScalarLoop, *Plan))
+        break;
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f99f51d5846cf..d0ec6521dbb25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1214,6 +1214,7 @@ VPlan *VPlan::duplicate() {
       NewPlan->ExitBlocks.push_back(cast<VPIRBasicBlock>(VPB));
   }
 
+  NewPlan->setEarlyExitContinuesInScalarLoop(EarlyExitContinuesInScalarLoop);
   return NewPlan;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a88ddf217da9b..941b09320a970 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4255,6 +4255,15 @@ class VPlan {
   /// VPlan is destroyed.
   SmallVector<VPBlockBase *> CreatedBlocks;
 
+  /// Overrides the vector loop preheader when early exits are handled in the
+  /// scalar loop. This may be a check block that needs to be wired up in the
+  /// right place alongside existing check blocks.
+  VPBasicBlock *EarlyExitPreheader = nullptr;
+
+  /// Indicates that a loop with an uncountable early exit should leave the
+  /// vector loop before the exit condition is reached, with the scalar loop
+  /// performing the last few iterations.
+  bool EarlyExitContinuesInScalarLoop = false;
+
   /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
   /// wrapping the original header of the scalar loop.
   VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
@@ -4296,12 +4305,17 @@ class VPlan {
   /// Returns the preheader of the vector loop region, if one exists, or null
   /// otherwise.
   VPBasicBlock *getVectorPreheader() {
+    if (EarlyExitPreheader)
+      return EarlyExitPreheader;
     VPRegionBlock *VectorRegion = getVectorLoopRegion();
     return VectorRegion
                ? cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())
                : nullptr;
   }
 
+  /// Overrides the current VPlan's vector preheader block.
+  void setEarlyExitPreheader(VPBasicBlock *BB) { EarlyExitPreheader = BB; }
+
   /// Returns the VPRegionBlock of the vector loop.
   LLVM_ABI_FOR_TEST VPRegionBlock *getVectorLoopRegion();
   LLVM_ABI_FOR_TEST const VPRegionBlock *getVectorLoopRegion() const;
@@ -4565,6 +4579,17 @@ class VPlan {
            (ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1);
   }
 
+  /// Returns true if the vector iteration containing an exit should be handled
+  /// in the scalar loop instead of by masking.
+  bool shouldEarlyExitContinueInScalarLoop() const {
+    return EarlyExitContinuesInScalarLoop;
+  }
+
+  /// If set to true, early exits should be handled in the scalar loop.
+  void setEarlyExitContinuesInScalarLoop(bool Continues) {
+    EarlyExitContinuesInScalarLoop = Continues;
+  }
+
   /// Returns true if the scalar tail may execute after the vector loop. Note
   /// that this relies on unneeded branches to the scalar tail loop being
   /// removed.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 612202d049774..9bc0bc64ac2e3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -588,10 +588,13 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
         handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
                                    cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
         HandledUncountableEarlyExit = true;
-      } else {
+      }
+
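+      // When the exit is instead handled in the scalar loop, the vector loop
+      // never branches directly to the early exit block, so drop the
+      // incoming phi values for this predecessor in that case too.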
+      if (!HasUncountableEarlyExit ||
+          Plan.shouldEarlyExitContinueInScalarLoop())
         for (VPRecipeBase &R : EB->phis())
           cast<VPIRPhi>(&R)->removeIncomingValueFor(Pred);
-      }
+
       cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
       VPBlockUtils::disconnectBlocks(Pred, EB);
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 26563242de283..523edf5e0a21b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -20,6 +20,7 @@
 #include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanUtils.h"
+#include "VPlanValue.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/PostOrderIterator.h"
@@ -35,6 +36,8 @@
 #include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
 #include "llvm/Analysis/ScopedNoAliasAA.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
@@ -1837,7 +1840,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
       HeaderR.eraseFromParent();
     }
 
-    VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
+    VPBlockBase *Preheader = Plan.getVectorPreheader();
     VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
     VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
     VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
@@ -3017,8 +3020,7 @@ void VPlanTransforms::replaceSymbolicStrides(
   // evolution.
   auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
     auto *R = cast<VPRecipeBase>(&U);
-    return R->getRegion() ||
-           R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
+    return R->getRegion() || R->getParent() == Plan.getVectorPreheader();
   };
   ValueToSCEVMapTy RewriteMap;
   for (const SCEV *Stride : StridesMap.values()) {
@@ -3533,7 +3535,8 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
                                                  VPBasicBlock *LatchVPBB) {
   VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
   if (!EarlyExitVPBB->getSinglePredecessor() &&
-      EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
+      EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB &&
+      !Plan.shouldEarlyExitContinueInScalarLoop()) {
     assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
            EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
            "unsupported early exit VPBB");
@@ -3558,42 +3561,45 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
   // block if CondToEarlyExit.
   VPValue *IsEarlyExitTaken =
       Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
-  VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
-  VPBasicBlock *VectorEarlyExitVPBB =
-      Plan.createVPBasicBlock("vector.early.exit");
-  VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
-  VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
-  NewMiddle->swapSuccessors();
-
-  VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
-
-  // Update the exit phis in the early exit block.
-  VPBuilder MiddleBuilder(NewMiddle);
-  VPBuilder EarlyExitB(VectorEarlyExitVPBB);
-  for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
-    auto *ExitIRI = cast<VPIRPhi>(&R);
-    // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
-    // a single predecessor and 1 if it has two.
-    unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
-    if (ExitIRI->getNumOperands() != 1) {
-      // The first of two operands corresponds to the latch exit, via MiddleVPBB
-      // predecessor. Extract its last lane.
-      ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
-    }
+  if (!Plan.shouldEarlyExitContinueInScalarLoop()) {
+    VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
+    VPBasicBlock *VectorEarlyExitVPBB =
+        Plan.createVPBasicBlock("vector.early.exit");
+    VPBlockUtils::insertOnEdge(LatchVPBB, MiddleVPBB, NewMiddle);
+    VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
+    NewMiddle->swapSuccessors();
+
+    VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
+
+    // Update the exit phis in the early exit block.
+    VPBuilder MiddleBuilder(NewMiddle);
+    VPBuilder EarlyExitB(VectorEarlyExitVPBB);
+    for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
+      auto *ExitIRI = cast<VPIRPhi>(&R);
+      // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
+      // a single predecessor and 1 if it has two.
+      unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
+      if (ExitIRI->getNumOperands() != 1) {
+        // The first of two operands corresponds to the latch exit, via
+        // MiddleVPBB predecessor. Extract its last lane.
+        ExitIRI->extractLastLaneOfFirstOperand(MiddleBuilder);
+      }
 
-    VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
-    if (!IncomingFromEarlyExit->isLiveIn()) {
-      // Update the incoming value from the early exit.
-      VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
-          VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
-          "first.active.lane");
-      IncomingFromEarlyExit = EarlyExitB.createNaryOp(
-          VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
-          nullptr, "early.exit.value");
-      ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
+      VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
+      if (!IncomingFromEarlyExit->isLiveIn()) {
+        // Update the incoming value from the early exit.
+        VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
+            VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,
+            "first.active.lane");
+        IncomingFromEarlyExit =
+            EarlyExitB.createNaryOp(VPInstruction::ExtractLane,
+                                    {FirstActiveLane, IncomingFromEarlyExit},
+                                    nullptr, "early.exit.value");
+        ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
+      }
     }
+    MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
   }
-  MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
 
   // Replace the condition controlling the non-early exit from the vector loop
   // with one exiting if either the original condition of the vector latch is
@@ -3610,6 +3616,147 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
   LatchExitingBranch->eraseFromParent();
 }
 
+bool VPlanTransforms::handleUncountableExitsInScalarLoop(VPlan &Plan) {
+  assert(!Plan.hasScalarVFOnly() &&
+         "Cannot transform uncountable exits in scalar loop");
+
+  // Returning false here abandons the VPlan entirely, so bail out gracefully
+  // rather than crash if earlier assumptions made on the scalar IR don't
+  // hold for the VPlan version of the loop.
+  VPCanonicalIVPHIRecipe *IV = Plan.getVectorLoopRegion()->getCanonicalIV();
+  VPInstruction *IVUpdate = dyn_cast<VPInstruction>(IV->getBackedgeValue());
+  if (!IVUpdate)
+    return false;
+
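+  // Collect the recipes that feed the uncountable exit condition, plus the
+  // address GEPs used by its loads.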
+  SmallVector<VPRecipeBase *, 2> GEPs;
+  SmallVector<VPRecipeBase *, 8> ConditionRecipes;
+
+  std::optional<VPValue *> Cond =
+      vputils::getRecipesForUncountableExit(Plan, ConditionRecipes, GEPs);
+  if (!Cond)
+    return false;
+
+  // Check GEPs to see if we can link them to the canonical IV.
+  using namespace llvm::VPlanPatternMatch;
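+  // Each address must be a GEP of a loop-invariant base indexed by
+  // unit-stride steps of the canonical IV, so that we can later retarget
+  // the index operand.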
+  for (auto *GEP : GEPs)
+    if (!match(GEP,
+               m_GetElementPtr(m_LiveIn(),
+                               m_ScalarIVSteps(m_Specific(IV), m_SpecificInt(1),
+                                               m_Specific(&Plan.getVF())))))
+      return false;
+
+  // Clone the condition recipes into the preheader.
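+  // ConditionRecipes is ordered users-before-defs, so clone in reverse to
+  // define each operand before its users.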
+  SmallDenseMap<VPRecipeBase *, VPRecipeBase *, 8> CloneMap;
+  VPBasicBlock *VectorPH = Plan.getVectorPreheader();
+  for (VPRecipeBase *R : reverse(ConditionRecipes)) {
+    VPRecipeBase *Clone = R->clone();
+    VectorPH->appendRecipe(Clone);
+    CloneMap[R] = Clone;
+  }
+
+  // Remap the cloned recipes to use the corresponding operands.
+  for (VPRecipeBase *R : ConditionRecipes) {
+    auto *Clone = CloneMap.at(R);
+    for (unsigned I = 0; I < R->getNumOperands(); ++I)
+      if (VPRecipeBase *OpR =
+              CloneMap.lookup(R->getOperand(I)->getDefiningRecipe()))
+        Clone->setOperand(I, OpR->getVPSingleValue());
+  }
+
+  // Adjust preheader GEPs to match the value they would have for the first
+  // iteration of the vector body.
+  for (auto *GEP : GEPs)
+    CloneMap.at(GEP)->setOperand(1, IV->getStartValue());
+
+  // Split the vector preheader; the original block becomes the bypass check
+  // block.
+  VPBasicBlock *NewPH = VectorPH->splitAt(VectorPH->end());
+  VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
+
+  // Create bypass block branch.
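+  // If any lane of the cloned exit condition is already true for the first
+  // vector iteration, branch straight to the scalar loop instead of entering
+  // the vector body.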
+  VPRecipeBase *Uncountable = (*Cond)->getDefiningRecipe();
+  VPRecipeBase *PHUncountable = CloneMap.at(Uncountable);
+  VPBuilder PHBuilder(VectorPH, VectorPH->end());
+  VPValue *PHAnyOf = PHBuilder.createNaryOp(
+      VPInstruction::AnyOf, {PHUncountable->getVPSingleValue()});
+  PHBuilder.createNaryOp(VPInstruction::BranchOnCond, {PHAnyOf},
+                         PHUncountable->getDebugLoc());
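+  // Rewire the CFG so that the check block branches to the scalar preheader
+  // when the exit would be taken, and to the split-off preheader otherwise.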
+  VectorPH->clearSuccessors();
+  NewPH->clearPredecessors();
+  VPBlockUtils::connectBlocks(VectorPH, ScalarPH);
+  VPBlockUtils::connectBlocks(VectorPH, NewPH);
+
+  // Modify plan so that other check blocks (e.g. SCEVs) can be attached to
+  // the correct block.
+  Plan.setEarlyExitPreheader(VectorPH);
+
+  // Fix up the resume phis in the scalar preheader -- we might not have
+  // reached the calculated maximum vector trip count, so use the next value
+  // of the IV instead.
+  VPBasicBlock *MiddleBlock = Plan.getMiddleBlock();
+  VPValue *VecTC = &Plan.getVectorTripCount();
+  for (VPRecipeBase &PHI : ScalarPH->phis()) {
+    VPPhi *ResumePHI = dyn_cast<VPPhi>(&PHI);
+    if (!ResumePHI)
+      return false;
+    VPValue *EntryVal = nullptr;
+    for (unsigned I = 0; I < ResumePHI->getNumIncoming(); ++I) {
+      const VPBasicBlock *Block = ResumePHI->getIncomingBlock(I);
+      VPValue *V = ResumePHI->getIncomingValue(I);
+      if (Block == Plan.getEntry()) {
+        EntryVal = ResumePHI->getIncomingValue(I);
+      } else if (Block == MiddleBlock && V == VecTC) {
+        ResumePHI->setOperand(I, IVUpdate);
+      } else {
+        return false;
+      }
+    }
+
+    if (!EntryVal)
+      return false;
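+    // The new bypass branch adds another scalar preheader predecessor; reuse
+    // the entry value as the incoming value for that edge.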
+    ResumePHI->addOperand(EntryVal);
+  }
+
+  // Move the IV update if necessary, then update the index operand of the GEP
+  // so that we load the next vector iteration's exit condition data.
+  VPDominatorTree VPDT(Plan);
+  for (auto *GEP : GEPs) {
+    if (!VPDT.properlyDominates(IVUpdate, GEP))
+      IVUpdate->moveBefore(*GEP->getParent(), GEP->getIterator());
+    GEP->setOperand(1, IVUpdate);
+  }
+
+  // Convert loads for the next vector iteration to use a mask so that we
+  // avoid any accesses that the scalar loop would not have performed.
+  for (VPRecipeBase *R : ConditionRecipes) {
+    if (auto *Load = dyn_cast<VPWidenLoadRecipe>(R)) {
+      // Bail out for now if it's already conditional.
+      if (Load->isMasked())
+        return false;
+      VPBuilder MaskBuilder(R);
+      VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+          ConstantInt::get(IntegerType::getInt64Ty(Plan.getContext()), 1));
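+      // Lane I of the mask is active iff IVUpdate + I is less than the
+      // vector trip count, so the load only touches elements that the next
+      // vector iteration is allowed to access.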
+      VPValue *LaneMask = MaskBuilder.createNaryOp(
+          VPInstruction::ActiveLaneMask,
+          {IVUpdate, &Plan.getVectorTripCount(), ALMMultiplier}, nullptr,
+          "uncountable.exit.mask");
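+      // Replace the unconditional load with a masked clone of it rather than
+      // mutating the recipe in place.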
+      VPWidenLoadRecipe *NewLoad = new VPWidenLoadRecipe(
+          *(cast<LoadInst>(Load->getUnderlyingValue())), Load->getOperand(0),
+          LaneMask, Load->isConsecutive(), Load->isReverse(), *Load,
+          Load->getDebugLoc());
+      MaskBuilder.insert(NewLoad);
+      Load->replaceAllUsesWith(NewLoad);
+      Load->eraseFromParent();
+    }
+  }
+
+  // Update the middle block branch to compare IVUpdate against the full trip
+  // count, since we may be exiting the vector loop early.
+  VPRecipeBase *OldTerminator = MiddleBlock->getTerminator();
+  VPBuilder MiddleBuilder(OldTerminator);
+  VPValue *FullTC =
+      MiddleBuilder.createICmp(CmpInst::ICMP_EQ, IVUpdate, Plan.getTripCount());
+  OldTerminator->setOperand(0, FullTC);
+
+  return true;
+}
+
 /// This function tries convert extended in-loop reductions to
 /// VPExpressionRecipe and clamp the \p Range if it is beneficial and
 /// valid. The created recipe must be decomposed to its constituent
@@ -4561,8 +4708,7 @@ void VPlanTransforms::addScalarResumePhis(
   auto *ScalarPH = Plan.getScalarPreheader();
   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
-  VPBuilder VectorPHBuilder(
-      cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
+  VPBuilder VectorPHBuilder(Plan.getVectorPreheader());
   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
   VPBuilder ScalarPHBuilder(ScalarPH);
   for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 708ea4185e1cb..ac833487395df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -262,6 +262,16 @@ struct VPlanTransforms {
                                          VPlan &Plan, VPBasicBlock *HeaderVPBB,
                                          VPBasicBlock *LatchVPBB);
 
+  /// Update \p Plan to check whether the next iteration of the vector loop
+  /// would exit (using any exit type) and if so branch to the scalar loop
+  /// instead. This requires identifying the recipes that form the conditions
+  /// for exiting, cloning them to the preheader, then adjusting both the
+  /// preheader recipes (to check the first vector iteration) and those in
+  /// the vector loop (to check the next vector iteration instead of the
+  /// current one). This can be used to avoid complex masking for state-changing
+  /// recipes (like stores).
+  static bool handleUncountableExitsInScalarLoop(VPlan &Plan);
+
   /// Replace loop regions with explicit CFG.
   static void dissolveLoopRegions(VPlan &Plan);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 2536d61392ed1..9fefbd8f4ce3f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -355,17 +355,45 @@ vputils::getRecipesForUncountableExit(VPlan &Plan,
       if (Load->isMasked())
         return std::nullopt;
 
+      Recipes.push_back(Load);
+
+      // Look through vector-pointer recipes.
       VPValue *GEP = Load->getAddr();
+      if (auto *VecPtrR = dyn_cast<VPVectorPointerRecipe>(GEP)) {
+        Recipes.push_back(VecPtrR);
+        GEP = VecPtrR->getOperand(0);
+      }
+
+      // We only support two-operand GEPs; check the operand count before
+      // matching the pattern below.
+      if (auto *R = GEP->getDefiningRecipe(); !R || R->getNumOperands() != 2)
+        return std::nullopt;
+
       if (!match(GEP, m_GetElementPtr(m_LiveIn(), m_VPValue())))
         return std::nullopt;
 
-      Recipes.push_back(Load);
       Recipes.push_back(GEP->getDefiningRecipe());
       GEPs.push_back(GEP->getDefiningRecipe());
     } else
       return std::nullopt;
   }
 
+  // If we couldn't match anything, don't return the condition. It may be
+  // defined outside the loop.
+  if (Recipes.empty())
+    return std::nullopt;
+
+#ifndef NDEBUG
+  // Check dominance ordering -- each recipe in the list should properly
+  // dominate the one collected before it.
+  VPRecipeBase *RA = Recipes.front();
+  VPDominatorTree VPDT(Plan);
+  bool Ordered = all_of(drop_begin(Recipes), [&VPDT, &RA](VPRecipeBase *RB) {
+    bool Dominates = VPDT.properlyDominates(RB, RA);
+    RA = RB;
+    return Dominates;
+  });
+  assert(Ordered && "Uncountable exit recipes unordered");
+#endif
+
   return UncountableCondition;
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/early-exit-handle-exits-in-scalar-loop.ll b/llvm/test/Transforms/LoopVectorize/early-exit-handle-exits-in-scalar-loop.ll
new file mode 100644
index 0000000000000..453c9b178a81c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/early-exit-handle-exits-in-scalar-loop.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -S < %s -p loop-vectorize -handle-early-exits-in-scalar-tail -force-vector-width=4 | FileCheck %s
+
+define i32 @simple_contains(ptr align 4 dereferenceable(100) readonly %array, i32 %elt) {
+; CHECK-LABEL: define i32 @simple_contains(
+; CHECK-SAME: ptr readonly align 4 dereferenceable(100) [[ARRAY:%.*]], i32 [[ELT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ELT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAY]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = freeze <4 x i1> [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH_SPLIT:.*]]
+; CHECK:       [[VECTOR_PH_SPLIT]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH_SPLIT]] ], [ [[IV:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV]]
+; CHECK-NEXT:    [[UNCOUNTABLE_EXIT_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IV]], i64 24)
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LD_ADDR]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[IV]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[IV]], 25
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[NOT_FOUND:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IV]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_PH]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[LD_ADDR1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV1]]
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[LD_ADDR1]], align 4
+; CHECK-NEXT:    [[CMP_EARLY:%.*]] = icmp eq i32 [[LD]], [[ELT]]
+; CHECK-NEXT:    br i1 [[CMP_EARLY]], label %[[FOUND:.*]], label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 25
+; CHECK-NEXT:    br i1 [[CMP]], label %[[NOT_FOUND]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       [[NOT_FOUND]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %ld.addr = getelementptr inbounds i32, ptr %array, i64 %iv
+  %ld = load i32, ptr %ld.addr, align 4
+  %cmp.early = icmp eq i32 %ld, %elt
+  br i1 %cmp.early, label %found, label %for.inc
+
+for.inc:
+  %iv.next = add nsw nuw i64 %iv, 1
+  %cmp = icmp eq i64 %iv.next, 25
+  br i1 %cmp, label %not.found, label %for.body
+
+found:
+  ret i32 1
+
+not.found:
+  ret i32 0
+}
+
+define i32 @contains_with_variable_tc(ptr readonly %array, i8 %elt, i64 %n) nofree nosync {
+; CHECK-LABEL: define i32 @contains_with_variable_tc(
+; CHECK-SAME: ptr readonly [[ARRAY:%.*]], i8 [[ELT:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[ARRAY]], i64 1), "dereferenceable"(ptr [[ARRAY]], i64 [[N]]) ]
+; CHECK-NEXT:    [[ZERO_TC:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[ZERO_TC]], label %[[NOT_FOUND:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[ITERS:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[ELT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[ARRAY]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = freeze <4 x i1> [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[SCALAR_PH]], label %[[VECTOR_PH_SPLIT:.*]]
+; CHECK:       [[VECTOR_PH_SPLIT]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH_SPLIT]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[UNCOUNTABLE_EXIT_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IV_NEXT]], i64 [[ITERS]])
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP3]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <4 x i8> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[ITERS]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[CMP]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[NOT_FOUND_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IV_NEXT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_PH]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], %[[FOR_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 [[IV]]
+; CHECK-NEXT:    [[LD:%.*]] = load i8, ptr [[LD_ADDR]], align 1
+; CHECK-NEXT:    [[CMP_EARLY:%.*]] = icmp eq i8 [[LD]], [[ELT]]
+; CHECK-NEXT:    br i1 [[CMP_EARLY]], label %[[FOUND:.*]], label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[NOT_FOUND_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    ret i32 1
+; CHECK:       [[NOT_FOUND_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[NOT_FOUND]]
+; CHECK:       [[NOT_FOUND]]:
+; CHECK-NEXT:    ret i32 0
+;
+
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %array, i64 1), "dereferenceable"(ptr %array, i64 %n) ]
+  %zero.tc = icmp eq i64 %n, 0
+  br i1 %zero.tc, label %not.found, label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %ld.addr = getelementptr inbounds i8, ptr %array, i64 %iv
+  %ld = load i8, ptr %ld.addr
+  %cmp.early = icmp eq i8 %ld, %elt
+  br i1 %cmp.early, label %found, label %for.inc
+
+for.inc:
+  %iv.next = add nsw nuw i64 %iv, 1
+  %cmp = icmp eq i64 %iv.next, %n
+  br i1 %cmp, label %not.found, label %for.body
+
+found:
+  ret i32 1
+
+not.found:
+  ret i32 0
+}
+
+define i64 @simple_contains_live_out_exit_val(ptr align 4 dereferenceable(100) readonly %array, i32 %elt) {
+; CHECK-LABEL: define i64 @simple_contains_live_out_exit_val(
+; CHECK-SAME: ptr readonly align 4 dereferenceable(100) [[ARRAY:%.*]], i32 [[ELT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ELT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAY]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = freeze <4 x i1> [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH_SPLIT:.*]]
+; CHECK:       [[VECTOR_PH_SPLIT]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH_SPLIT]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[UNCOUNTABLE_EXIT_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT]], i64 24)
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 25
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[NOT_FOUND:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[INDEX_NEXT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_PH]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV]]
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4
+; CHECK-NEXT:    [[CMP_EARLY:%.*]] = icmp eq i32 [[LD]], [[ELT]]
+; CHECK-NEXT:    br i1 [[CMP_EARLY]], label %[[FOUND:.*]], label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 25
+; CHECK-NEXT:    br i1 [[CMP]], label %[[NOT_FOUND]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[FOUND]]:
+; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    ret i64 [[IV_LCSSA]]
+; CHECK:       [[NOT_FOUND]]:
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_NEXT]], %[[FOR_INC]] ], [ 24, %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[IV_NEXT_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %ld.addr = getelementptr inbounds i32, ptr %array, i64 %iv
+  %ld = load i32, ptr %ld.addr, align 4
+  %cmp.early = icmp eq i32 %ld, %elt
+  br i1 %cmp.early, label %found, label %for.inc
+
+for.inc:
+  %iv.next = add nsw nuw i64 %iv, 1
+  %cmp = icmp eq i64 %iv.next, 25
+  br i1 %cmp, label %not.found, label %for.body
+
+found:
+  ret i64 %iv
+
+not.found:
+  ret i64 %iv.next
+}
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 6954d04f53f04..274ca8fc3ef32 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; REQUIRES: asserts
 ; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 -disable-output 2>&1 | FileCheck %s
+; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 -disable-output -handle-early-exits-in-scalar-tail 2>&1 | FileCheck %s
 
 declare void @init_mem(ptr, i64);
 
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
index 4fd8d17073de4..571994b51962d 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S < %s -p loop-vectorize -force-vector-width=4 | FileCheck %s
+; RUN: opt -S < %s -p loop-vectorize -force-vector-width=4 -handle-early-exits-in-scalar-tail | FileCheck %s --check-prefix=EE-SCALAR
 
 declare void @init_mem(ptr, i64);
 
@@ -38,6 +39,29 @@ define i64 @same_exit_block_phi_of_consts() {
 ; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[RETVAL]]
 ;
+; EE-SCALAR-LABEL: define i64 @same_exit_block_phi_of_consts() {
+; EE-SCALAR-NEXT:  entry:
+; EE-SCALAR-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; EE-SCALAR-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; EE-SCALAR-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; EE-SCALAR-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; EE-SCALAR-NEXT:    br label [[LOOP:%.*]]
+; EE-SCALAR:       loop:
+; EE-SCALAR-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; EE-SCALAR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; EE-SCALAR-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EE-SCALAR-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; EE-SCALAR-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; EE-SCALAR-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; EE-SCALAR-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; EE-SCALAR:       loop.inc:
+; EE-SCALAR-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; EE-SCALAR-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; EE-SCALAR-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; EE-SCALAR:       loop.end:
+; EE-SCALAR-NEXT:    [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ]
+; EE-SCALAR-NEXT:    ret i64 [[RETVAL]]
+;
 entry:
   %p1 = alloca [1024 x i8]
   %p2 = alloca [1024 x i8]
@@ -100,6 +124,30 @@ define i64 @diff_exit_block_phi_of_consts() {
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    ret i64 1
 ;
+; EE-SCALAR-LABEL: define i64 @diff_exit_block_phi_of_consts() {
+; EE-SCALAR-NEXT:  entry:
+; EE-SCALAR-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 1
+; EE-SCALAR-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 1
+; EE-SCALAR-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; EE-SCALAR-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; EE-SCALAR-NEXT:    br label [[LOOP:%.*]]
+; EE-SCALAR:       loop:
+; EE-SCALAR-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; EE-SCALAR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; EE-SCALAR-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EE-SCALAR-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; EE-SCALAR-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; EE-SCALAR-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; EE-SCALAR-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]]
+; EE-SCALAR:       loop.inc:
+; EE-SCALAR-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; EE-SCALAR-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; EE-SCALAR-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; EE-SCALAR:       loop.early.exit:
+; EE-SCALAR-NEXT:    ret i64 0
+; EE-SCALAR:       loop.end:
+; EE-SCALAR-NEXT:    ret i64 1
+;
 entry:
   %p1 = alloca [1024 x i8]
   %p2 = alloca [1024 x i8]
@@ -202,6 +250,35 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i32 0
 ;
+; EE-SCALAR-LABEL: define i32 @diff_exit_block_needs_scev_check(
+; EE-SCALAR-SAME: i32 [[END:%.*]]) {
+; EE-SCALAR-NEXT:  entry:
+; EE-SCALAR-NEXT:    [[P1:%.*]] = alloca [1024 x i32], align 4
+; EE-SCALAR-NEXT:    [[P2:%.*]] = alloca [1024 x i32], align 4
+; EE-SCALAR-NEXT:    call void @init_mem(ptr [[P1]], i64 1024)
+; EE-SCALAR-NEXT:    call void @init_mem(ptr [[P2]], i64 1024)
+; EE-SCALAR-NEXT:    [[END_CLAMPED:%.*]] = and i32 [[END]], 1023
+; EE-SCALAR-NEXT:    br label [[FOR_BODY:%.*]]
+; EE-SCALAR:       for.body:
+; EE-SCALAR-NEXT:    [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; EE-SCALAR-NEXT:    [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ]
+; EE-SCALAR-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]]
+; EE-SCALAR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; EE-SCALAR-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]]
+; EE-SCALAR-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EE-SCALAR-NEXT:    [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; EE-SCALAR-NEXT:    br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]]
+; EE-SCALAR:       for.inc:
+; EE-SCALAR-NEXT:    [[IND_NEXT]] = add i8 [[IND]], 1
+; EE-SCALAR-NEXT:    [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32
+; EE-SCALAR-NEXT:    [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1
+; EE-SCALAR-NEXT:    [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]]
+; EE-SCALAR-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; EE-SCALAR:       found:
+; EE-SCALAR-NEXT:    ret i32 1
+; EE-SCALAR:       exit:
+; EE-SCALAR-NEXT:    ret i32 0
+;
 entry:
   %p1 = alloca [1024 x i32]
   %p2 = alloca [1024 x i32]
@@ -271,6 +348,25 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) {
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret i32 0
 ;
+; EE-SCALAR-LABEL: define i32 @diff_blocks_invariant_early_exit_cond(
+; EE-SCALAR-SAME: ptr [[S:%.*]]) {
+; EE-SCALAR-NEXT:  entry:
+; EE-SCALAR-NEXT:    [[SVAL:%.*]] = load i32, ptr [[S]], align 4
+; EE-SCALAR-NEXT:    [[COND:%.*]] = icmp eq i32 [[SVAL]], 0
+; EE-SCALAR-NEXT:    br label [[FOR_BODY:%.*]]
+; EE-SCALAR:       for.body:
+; EE-SCALAR-NEXT:    [[IND:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; EE-SCALAR-NEXT:    br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT:%.*]]
+; EE-SCALAR:       for.inc:
+; EE-SCALAR-NEXT:    [[IND_NEXT]] = add nsw i32 [[IND]], 1
+; EE-SCALAR-NEXT:    [[EC:%.*]] = icmp eq i32 [[IND_NEXT]], 266
+; EE-SCALAR-NEXT:    br i1 [[EC]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; EE-SCALAR:       early.exit:
+; EE-SCALAR-NEXT:    tail call void @abort()
+; EE-SCALAR-NEXT:    unreachable
+; EE-SCALAR:       for.end:
+; EE-SCALAR-NEXT:    ret i32 0
+;
 entry:
   %sval = load i32, ptr %s, align 4
   %cond = icmp eq i32 %sval, 0
@@ -355,6 +451,70 @@ define void @inner_loop_trip_count_depends_on_outer_iv(ptr align 8 dereferenceab
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
+; EE-SCALAR-LABEL: define void @inner_loop_trip_count_depends_on_outer_iv(
+; EE-SCALAR-SAME: ptr align 8 dereferenceable(1792) [[THIS:%.*]], ptr [[DST:%.*]]) {
+; EE-SCALAR-NEXT:  entry:
+; EE-SCALAR-NEXT:    [[GEP_SRC:%.*]] = getelementptr i8, ptr [[THIS]], i64 1000
+; EE-SCALAR-NEXT:    br label [[OUTER_HEADER:%.*]]
+; EE-SCALAR:       outer.header:
+; EE-SCALAR-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ]
+; EE-SCALAR-NEXT:    [[C_1:%.*]] = icmp eq i64 [[OUTER_IV]], 0
+; EE-SCALAR-NEXT:    br i1 [[C_1]], label [[THEN:%.*]], label [[INNER_HEADER_PREHEADER:%.*]]
+; EE-SCALAR:       inner.header.preheader:
+; EE-SCALAR-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[OUTER_IV]], 4
+; EE-SCALAR-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EE-SCALAR:       vector.ph:
+; EE-SCALAR-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[OUTER_IV]], 4
+; EE-SCALAR-NEXT:    [[N_VEC:%.*]] = sub i64 [[OUTER_IV]], [[N_MOD_VF]]
+; EE-SCALAR-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x ptr>, ptr [[GEP_SRC]], align 8
+; EE-SCALAR-NEXT:    [[TMP0:%.*]] = icmp eq <4 x ptr> [[WIDE_LOAD]], zeroinitializer
+; EE-SCALAR-NEXT:    [[TMP1:%.*]] = freeze <4 x i1> [[TMP0]]
+; EE-SCALAR-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
+; EE-SCALAR-NEXT:    br i1 [[TMP2]], label [[SCALAR_PH]], label [[VECTOR_PH_SPLIT:%.*]]
+; EE-SCALAR:       vector.ph.split:
+; EE-SCALAR-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EE-SCALAR:       vector.body:
+; EE-SCALAR-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_SPLIT]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EE-SCALAR-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; EE-SCALAR-NEXT:    [[TMP3:%.*]] = getelementptr ptr, ptr [[GEP_SRC]], i64 [[INDEX_NEXT]]
+; EE-SCALAR-NEXT:    [[UNCOUNTABLE_EXIT_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT]], i64 [[N_VEC]])
+; EE-SCALAR-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP3]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <4 x ptr> poison)
+; EE-SCALAR-NEXT:    [[TMP4:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
+; EE-SCALAR-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; EE-SCALAR-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; EE-SCALAR-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EE-SCALAR-NEXT:    [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; EE-SCALAR-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EE-SCALAR:       middle.block:
+; EE-SCALAR-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[OUTER_IV]]
+; EE-SCALAR-NEXT:    br i1 [[TMP9]], label [[OUTER_LATCH_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EE-SCALAR:       scalar.ph:
+; EE-SCALAR-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[MIDDLE_BLOCK]] ], [ 0, [[INNER_HEADER_PREHEADER]] ], [ 0, [[VECTOR_PH]] ]
+; EE-SCALAR-NEXT:    br label [[INNER_HEADER:%.*]]
+; EE-SCALAR:       inner.header:
+; EE-SCALAR-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[INNER_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EE-SCALAR-NEXT:    [[GEP_IV:%.*]] = getelementptr ptr, ptr [[GEP_SRC]], i64 [[IV]]
+; EE-SCALAR-NEXT:    [[L:%.*]] = load ptr, ptr [[GEP_IV]], align 8
+; EE-SCALAR-NEXT:    [[C_2:%.*]] = icmp eq ptr [[L]], null
+; EE-SCALAR-NEXT:    br i1 [[C_2]], label [[THEN_LOOPEXIT:%.*]], label [[INNER_LATCH]]
+; EE-SCALAR:       inner.latch:
+; EE-SCALAR-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; EE-SCALAR-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[OUTER_IV]]
+; EE-SCALAR-NEXT:    br i1 [[EC]], label [[OUTER_LATCH_LOOPEXIT]], label [[INNER_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; EE-SCALAR:       then.loopexit:
+; EE-SCALAR-NEXT:    br label [[THEN]]
+; EE-SCALAR:       then:
+; EE-SCALAR-NEXT:    store i32 0, ptr [[DST]], align 4
+; EE-SCALAR-NEXT:    br label [[OUTER_LATCH]]
+; EE-SCALAR:       outer.latch.loopexit:
+; EE-SCALAR-NEXT:    br label [[OUTER_LATCH]]
+; EE-SCALAR:       outer.latch:
+; EE-SCALAR-NEXT:    [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1
+; EE-SCALAR-NEXT:    [[OUTER_EC:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], 100
+; EE-SCALAR-NEXT:    br i1 [[OUTER_EC]], label [[EXIT:%.*]], label [[OUTER_HEADER]]
+; EE-SCALAR:       exit:
+; EE-SCALAR-NEXT:    ret void
+;
 entry:
   %gep.src = getelementptr i8, ptr %this, i64 1000
   br label %outer.header
@@ -447,6 +607,34 @@ define i64 @loop_guard_needed_to_prove_dereferenceable(i32 %x, i1 %cmp2) {
 ; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ]
 ; CHECK-NEXT:    ret i64 [[RES]]
 ;
+; EE-SCALAR-LABEL: define i64 @loop_guard_needed_to_prove_dereferenceable(
+; EE-SCALAR-SAME: i32 [[X:%.*]], i1 [[CMP2:%.*]]) {
+; EE-SCALAR-NEXT:  entry:
+; EE-SCALAR-NEXT:    [[A:%.*]] = alloca [32 x i32], align 4
+; EE-SCALAR-NEXT:    call void @init_mem(ptr [[A]], i64 128)
+; EE-SCALAR-NEXT:    [[C_X:%.*]] = icmp sgt i32 [[X]], 0
+; EE-SCALAR-NEXT:    br i1 [[C_X]], label [[PH:%.*]], label [[EXIT:%.*]]
+; EE-SCALAR:       ph:
+; EE-SCALAR-NEXT:    [[N:%.*]] = tail call i32 @llvm.smin.i32(i32 [[X]], i32 31)
+; EE-SCALAR-NEXT:    [[N_EXT:%.*]] = zext i32 [[N]] to i64
+; EE-SCALAR-NEXT:    br label [[LOOP_HEADER:%.*]]
+; EE-SCALAR:       loop.header:
+; EE-SCALAR-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; EE-SCALAR-NEXT:    [[ARRAYIDX42:%.*]] = getelementptr [32 x i32], ptr [[A]], i64 0, i64 [[IV]]
+; EE-SCALAR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX42]], align 4
+; EE-SCALAR-NEXT:    [[CMP43:%.*]] = icmp eq i32 [[TMP0]], 0
+; EE-SCALAR-NEXT:    br i1 [[CMP43]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_LATCH]]
+; EE-SCALAR:       loop.latch:
+; EE-SCALAR-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; EE-SCALAR-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]]
+; EE-SCALAR-NEXT:    br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]]
+; EE-SCALAR:       exit.loopexit:
+; EE-SCALAR-NEXT:    [[RES_PH:%.*]] = phi i64 [ [[IV]], [[LOOP_HEADER]] ], [ -1, [[LOOP_LATCH]] ]
+; EE-SCALAR-NEXT:    br label [[EXIT]]
+; EE-SCALAR:       exit:
+; EE-SCALAR-NEXT:    [[RES:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ]
+; EE-SCALAR-NEXT:    ret i64 [[RES]]
+;
 entry:
   %A = alloca [32 x i32], align 4
   call void @init_mem(ptr %A, i64 128)
@@ -504,6 +692,30 @@ define ptr @btc_and_max_btc_require_predicates(ptr noalias %start, i64 %offset)
 ; CHECK-NEXT:    [[RES:%.*]] = phi ptr [ [[IV_1]], [[LOOP_HEADER]] ], [ [[IV_2]], [[LOOP_LATCH]] ]
 ; CHECK-NEXT:    ret ptr [[RES]]
 ;
+; EE-SCALAR-LABEL: define ptr @btc_and_max_btc_require_predicates(
+; EE-SCALAR-SAME: ptr noalias [[START:%.*]], i64 [[OFFSET:%.*]]) {
+; EE-SCALAR-NEXT:  entry:
+; EE-SCALAR-NEXT:    [[END:%.*]] = getelementptr i32, ptr [[START]], i64 [[OFFSET]]
+; EE-SCALAR-NEXT:    [[PRE_1:%.*]] = icmp ult i64 [[OFFSET]], 100
+; EE-SCALAR-NEXT:    call void @llvm.assume(i1 [[PRE_1]])
+; EE-SCALAR-NEXT:    [[PRE_2:%.*]] = icmp ugt i64 [[OFFSET]], 1
+; EE-SCALAR-NEXT:    call void @llvm.assume(i1 [[PRE_2]])
+; EE-SCALAR-NEXT:    br label [[LOOP_HEADER:%.*]]
+; EE-SCALAR:       loop.header:
+; EE-SCALAR-NEXT:    [[IV_1:%.*]] = phi ptr [ @A, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; EE-SCALAR-NEXT:    [[IV_2:%.*]] = phi ptr [ [[START]], [[ENTRY]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_LATCH]] ]
+; EE-SCALAR-NEXT:    [[L:%.*]] = load i32, ptr [[IV_1]], align 4
+; EE-SCALAR-NEXT:    [[C:%.*]] = icmp eq i32 [[L]], 0
+; EE-SCALAR-NEXT:    br i1 [[C]], label [[LOOP_LATCH]], label [[EXIT:%.*]]
+; EE-SCALAR:       loop.latch:
+; EE-SCALAR-NEXT:    [[IV_2_NEXT]] = getelementptr i8, ptr [[IV_2]], i64 40
+; EE-SCALAR-NEXT:    [[IV_1_NEXT]] = getelementptr i8, ptr [[IV_1]], i64 40
+; EE-SCALAR-NEXT:    [[EC:%.*]] = icmp eq ptr [[IV_2]], [[END]]
+; EE-SCALAR-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]]
+; EE-SCALAR:       exit:
+; EE-SCALAR-NEXT:    [[RES:%.*]] = phi ptr [ [[IV_1]], [[LOOP_HEADER]] ], [ [[IV_2]], [[LOOP_LATCH]] ]
+; EE-SCALAR-NEXT:    ret ptr [[RES]]
+;
 entry:
   %end = getelementptr i32, ptr %start, i64 %offset
   %pre.1 = icmp ult i64 %offset, 100
@@ -595,6 +807,71 @@ define i64 @loop_guards_needed_to_prove_deref_multiple(i32 %x, i1 %c, ptr derefe
 ; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ -1, [[ENTRY:%.*]] ], [ -2, [[THEN]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ]
 ; CHECK-NEXT:    ret i64 [[RES]]
 ;
+; EE-SCALAR-LABEL: define i64 @loop_guards_needed_to_prove_deref_multiple(
+; EE-SCALAR-SAME: i32 [[X:%.*]], i1 [[C:%.*]], ptr dereferenceable(1024) [[SRC:%.*]]) {
+; EE-SCALAR-NEXT:  entry:
+; EE-SCALAR-NEXT:    [[X_AND:%.*]] = and i32 [[X]], -2
+; EE-SCALAR-NEXT:    [[PRE_0:%.*]] = icmp eq i32 [[X]], 0
+; EE-SCALAR-NEXT:    br i1 [[PRE_0]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; EE-SCALAR:       then:
+; EE-SCALAR-NEXT:    [[SEL:%.*]] = select i1 [[C]], i32 [[X_AND]], i32 0
+; EE-SCALAR-NEXT:    [[PRE_1:%.*]] = icmp ugt i32 [[SEL]], 1024
+; EE-SCALAR-NEXT:    br i1 [[PRE_1]], label [[EXIT]], label [[PH:%.*]]
+; EE-SCALAR:       ph:
+; EE-SCALAR-NEXT:    [[PRE_2:%.*]] = icmp ne i32 [[SEL]], 0
+; EE-SCALAR-NEXT:    call void @llvm.assume(i1 [[PRE_2]])
+; EE-SCALAR-NEXT:    [[N:%.*]] = add i32 [[SEL]], -1
+; EE-SCALAR-NEXT:    [[N_EXT:%.*]] = zext i32 [[N]] to i64
+; EE-SCALAR-NEXT:    [[TMP0:%.*]] = add i32 [[SEL]], -2
+; EE-SCALAR-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; EE-SCALAR-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 2
+; EE-SCALAR-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; EE-SCALAR-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EE-SCALAR:       vector.ph:
+; EE-SCALAR-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; EE-SCALAR-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; EE-SCALAR-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[SRC]], align 1
+; EE-SCALAR-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; EE-SCALAR-NEXT:    [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; EE-SCALAR-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; EE-SCALAR-NEXT:    br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH_SPLIT:%.*]]
+; EE-SCALAR:       vector.ph.split:
+; EE-SCALAR-NEXT:    br label [[VECTOR_BODY:%.*]]
+; EE-SCALAR:       vector.body:
+; EE-SCALAR-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_SPLIT]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EE-SCALAR-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; EE-SCALAR-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX_NEXT]]
+; EE-SCALAR-NEXT:    [[UNCOUNTABLE_EXIT_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT]], i64 [[N_VEC]])
+; EE-SCALAR-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP6]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <4 x i8> poison)
+; EE-SCALAR-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; EE-SCALAR-NEXT:    [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
+; EE-SCALAR-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
+; EE-SCALAR-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EE-SCALAR-NEXT:    [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
+; EE-SCALAR-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EE-SCALAR:       middle.block:
+; EE-SCALAR-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP2]]
+; EE-SCALAR-NEXT:    br i1 [[TMP12]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EE-SCALAR:       scalar.ph:
+; EE-SCALAR-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ], [ 0, [[VECTOR_PH]] ]
+; EE-SCALAR-NEXT:    br label [[LOOP_HEADER:%.*]]
+; EE-SCALAR:       loop.header:
+; EE-SCALAR-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EE-SCALAR-NEXT:    [[GEP_SRC_I:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; EE-SCALAR-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC_I]], align 1
+; EE-SCALAR-NEXT:    [[C_1:%.*]] = icmp eq i8 [[L]], 0
+; EE-SCALAR-NEXT:    br i1 [[C_1]], label [[EXIT_LOOPEXIT]], label [[LOOP_LATCH]]
+; EE-SCALAR:       loop.latch:
+; EE-SCALAR-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; EE-SCALAR-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]]
+; EE-SCALAR-NEXT:    br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; EE-SCALAR:       exit.loopexit:
+; EE-SCALAR-NEXT:    [[RES_PH:%.*]] = phi i64 [ [[IV]], [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ], [ 0, [[MIDDLE_BLOCK]] ]
+; EE-SCALAR-NEXT:    br label [[EXIT]]
+; EE-SCALAR:       exit:
+; EE-SCALAR-NEXT:    [[RES:%.*]] = phi i64 [ -1, [[ENTRY:%.*]] ], [ -2, [[THEN]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ]
+; EE-SCALAR-NEXT:    ret i64 [[RES]]
+;
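+; A rough C-style sketch (pseudocode in comments only; VF, n_vec and
+; trip_count stand in for 4, %n.vec and the full trip count) of the control
+; flow the EE-SCALAR checks above expect:
+;
+;   // vector.ph: probe the first VF iterations up front; if any lane would
+;   // take the early exit, run everything in the scalar loop instead.
+;   if (any(src[0 .. VF-1] == 0))
+;     goto scalar_loop(iv = 0);
+;   iv = 0;
+;   do {                     // vector.body: [iv, iv+VF) is known exit-free
+;     iv += VF;
+;     mask = active_lane_mask(iv, n_vec);
+;     early = any(mask & (src[iv .. iv+VF-1] == 0));  // probe the next block
+;   } while (!early && iv != n_vec);
+;   // middle.block: if the whole trip count ran with no early exit pending,
+;   // jump straight to the exit; otherwise resume in the scalar loop at the
+;   // first unprocessed iteration, which takes the exit with the right values.
+;   if (iv == trip_count)
+;     goto exit(res = 0);
+;   goto scalar_loop(iv);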
 entry:
   %x.and = and i32 %x, -2
   %pre.0 = icmp eq i32 %x, 0
@@ -643,3 +920,10 @@ exit:
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
 ;.
+; EE-SCALAR: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; EE-SCALAR: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; EE-SCALAR: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; EE-SCALAR: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; EE-SCALAR: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; EE-SCALAR: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
index 61ed49296070e..3555d7e55917f 100644
--- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S -debug %s 2>&1 | FileCheck %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -handle-early-exits-in-scalar-tail -S -debug %s 2>&1 | FileCheck %s --check-prefix=EE-SCALAR
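+; The EE-SCALAR prefix checks the VPlans produced when early-exit handling is
+; deferred to the scalar tail (-handle-early-exits-in-scalar-tail).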
 
 ; REQUIRES: asserts
 
@@ -66,6 +67,71 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
 ; CHECK-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph)
 ; CHECK:      No successors
 ; CHECK-NEXT: }
+
+; EE-SCALAR-LABEL: multi_exiting_to_different_exits_live_in_exit_values
+; EE-SCALAR: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; EE-SCALAR-NEXT: Live-in vp<%0> = VF
+; EE-SCALAR-NEXT: Live-in vp<%1> = VF * UF
+; EE-SCALAR-NEXT: Live-in vp<%2> = vector-trip-count
+; EE-SCALAR-NEXT: Live-in ir<128> = original trip-count
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<entry>:
+; EE-SCALAR-NEXT:   IR   %src = alloca [128 x i32], align 4
+; EE-SCALAR-NEXT:   IR   call void @init(ptr %src)
+; EE-SCALAR-NEXT: Successor(s): scalar.ph, vector.ph
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: vector.ph:
+; EE-SCALAR-NEXT:   CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, ir<0>
+; EE-SCALAR-NEXT:   vp<%3> = vector-pointer ir<%gep.src>
+; EE-SCALAR-NEXT:   WIDEN ir<%l> = load vp<%3>
+; EE-SCALAR-NEXT:   WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10>
+; EE-SCALAR-NEXT:   EMIT vp<%4> = any-of ir<%c.1>
+; EE-SCALAR-NEXT:   EMIT branch-on-cond vp<%4>
+; EE-SCALAR-NEXT: Successor(s): scalar.ph, vector.ph.split
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: vector.ph.split:
+; EE-SCALAR-NEXT: Successor(s): vector loop
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: <x1> vector loop: {
+; EE-SCALAR-NEXT:   vector.body:
+; EE-SCALAR-NEXT:     EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; EE-SCALAR-NEXT:     vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; EE-SCALAR-NEXT:     EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; EE-SCALAR-NEXT:     CLONE ir<%gep.src>.1 = getelementptr inbounds ir<%src>, vp<%index.next>
+; EE-SCALAR-NEXT:     vp<%8> = vector-pointer ir<%gep.src>.1
+; EE-SCALAR-NEXT:     EMIT vp<%uncountable.exit.mask> = active lane mask vp<%index.next>, vp<%2>, ir<1>
+; EE-SCALAR-NEXT:     WIDEN ir<%l>.1 = load vp<%8>, vp<%uncountable.exit.mask>
+; EE-SCALAR-NEXT:     WIDEN ir<%c.1>.1 = icmp eq ir<%l>.1, ir<10>
+; EE-SCALAR-NEXT:     EMIT vp<%9> = any-of ir<%c.1>.1
+; EE-SCALAR-NEXT:     EMIT vp<%10> = icmp eq vp<%index.next>, vp<%2>
+; EE-SCALAR-NEXT:     EMIT vp<%11> = or vp<%9>, vp<%10>
+; EE-SCALAR-NEXT:     EMIT branch-on-cond vp<%11>
+; EE-SCALAR-NEXT:   No successors
+; EE-SCALAR-NEXT: }
+; EE-SCALAR-NEXT: Successor(s): middle.block
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: middle.block:
+; EE-SCALAR-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<128>, vp<%2>
+; EE-SCALAR-NEXT:   EMIT vp<%13> = icmp eq vp<%index.next>, ir<128>
+; EE-SCALAR-NEXT:   EMIT branch-on-cond vp<%13>
+; EE-SCALAR-NEXT: Successor(s): ir-bb<e2>, scalar.ph
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<e2>:
+; EE-SCALAR-NEXT:   IR   %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1> from middle.block)
+; EE-SCALAR-NEXT: No successors
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: scalar.ph:
+; EE-SCALAR-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%index.next>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, vector.ph ]
+; EE-SCALAR-NEXT: Successor(s): ir-bb<loop.header>
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<loop.header>:
+; EE-SCALAR-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; EE-SCALAR-NEXT:   IR   %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+; EE-SCALAR-NEXT:   IR   %l = load i32, ptr %gep.src, align 4
+; EE-SCALAR-NEXT:   IR   %c.1 = icmp eq i32 %l, 10
+; EE-SCALAR-NEXT: No successors
+; EE-SCALAR-NEXT: }
+
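+; Note that with -handle-early-exits-in-scalar-tail the plan above (and the
+; two near-identical plans that follow) has no dedicated vector early-exit
+; block: the any-of of the exit condition is OR'd with the counted exit to
+; leave the vector loop, and the middle block either branches to the exit
+; (full trip count reached) or falls back to the scalar loop, which takes the
+; early exit itself.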
 entry:
   %src = alloca [128 x i32]
   call void @init(ptr %src)
@@ -150,6 +216,71 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() {
 ; CHECK:      No successors
 ; CHECK-NEXT: }
 
+; EE-SCALAR-LABEL: multi_exiting_to_same_exit_live_in_exit_values
+; EE-SCALAR: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; EE-SCALAR-NEXT: Live-in vp<%0> = VF
+; EE-SCALAR-NEXT: Live-in vp<%1> = VF * UF
+; EE-SCALAR-NEXT: Live-in vp<%2> = vector-trip-count
+; EE-SCALAR-NEXT: Live-in ir<128> = original trip-count
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<entry>:
+; EE-SCALAR-NEXT:   IR   %src = alloca [128 x i32], align 4
+; EE-SCALAR-NEXT:   IR   call void @init(ptr %src)
+; EE-SCALAR-NEXT: Successor(s): scalar.ph, vector.ph
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: vector.ph:
+; EE-SCALAR-NEXT:   CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, ir<0>
+; EE-SCALAR-NEXT:   vp<%3> = vector-pointer ir<%gep.src>
+; EE-SCALAR-NEXT:   WIDEN ir<%l> = load vp<%3>
+; EE-SCALAR-NEXT:   WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10>
+; EE-SCALAR-NEXT:   EMIT vp<%4> = any-of ir<%c.1>
+; EE-SCALAR-NEXT:   EMIT branch-on-cond vp<%4>
+; EE-SCALAR-NEXT: Successor(s): scalar.ph, vector.ph.split
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: vector.ph.split:
+; EE-SCALAR-NEXT: Successor(s): vector loop
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: <x1> vector loop: {
+; EE-SCALAR-NEXT:   vector.body:
+; EE-SCALAR-NEXT:     EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; EE-SCALAR-NEXT:     vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; EE-SCALAR-NEXT:     EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; EE-SCALAR-NEXT:     CLONE ir<%gep.src>.1 = getelementptr inbounds ir<%src>, vp<%index.next>
+; EE-SCALAR-NEXT:     vp<%8> = vector-pointer ir<%gep.src>.1
+; EE-SCALAR-NEXT:     EMIT vp<%uncountable.exit.mask> = active lane mask vp<%index.next>, vp<%2>, ir<1>
+; EE-SCALAR-NEXT:     WIDEN ir<%l>.1 = load vp<%8>, vp<%uncountable.exit.mask>
+; EE-SCALAR-NEXT:     WIDEN ir<%c.1>.1 = icmp eq ir<%l>.1, ir<10>
+; EE-SCALAR-NEXT:     EMIT vp<%9> = any-of ir<%c.1>.1
+; EE-SCALAR-NEXT:     EMIT vp<%10> = icmp eq vp<%index.next>, vp<%2>
+; EE-SCALAR-NEXT:     EMIT vp<%11> = or vp<%9>, vp<%10>
+; EE-SCALAR-NEXT:     EMIT branch-on-cond vp<%11>
+; EE-SCALAR-NEXT:   No successors
+; EE-SCALAR-NEXT: }
+; EE-SCALAR-NEXT: Successor(s): middle.block
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: middle.block:
+; EE-SCALAR-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<128>, vp<%2>
+; EE-SCALAR-NEXT:   EMIT vp<%13> = icmp eq vp<%index.next>, ir<128>
+; EE-SCALAR-NEXT:   EMIT branch-on-cond vp<%13>
+; EE-SCALAR-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<exit>:
+; EE-SCALAR-NEXT:   IR   %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<1> from middle.block)
+; EE-SCALAR-NEXT: No successors
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: scalar.ph:
+; EE-SCALAR-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%index.next>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, vector.ph ]
+; EE-SCALAR-NEXT: Successor(s): ir-bb<loop.header>
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<loop.header>:
+; EE-SCALAR-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; EE-SCALAR-NEXT:   IR   %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+; EE-SCALAR-NEXT:   IR   %l = load i32, ptr %gep.src, align 4
+; EE-SCALAR-NEXT:   IR   %c.1 = icmp eq i32 %l, 10
+; EE-SCALAR-NEXT: No successors
+; EE-SCALAR-NEXT: }
+
 entry:
   %src = alloca [128 x i32]
   call void @init(ptr %src)
@@ -230,6 +361,70 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() {
 ; CHECK:      No successors
 ; CHECK-NEXT: }
 
+; EE-SCALAR-LABEL: multi_exiting_to_same_exit_live_in_exit_values_2
+; EE-SCALAR: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; EE-SCALAR-NEXT: Live-in vp<%0> = VF
+; EE-SCALAR-NEXT: Live-in vp<%1> = VF * UF
+; EE-SCALAR-NEXT: Live-in vp<%2> = vector-trip-count
+; EE-SCALAR-NEXT: Live-in ir<128> = original trip-count
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<entry>:
+; EE-SCALAR-NEXT:   IR   %src = alloca [128 x i32], align 4
+; EE-SCALAR-NEXT:   IR   call void @init(ptr %src)
+; EE-SCALAR-NEXT: Successor(s): scalar.ph, vector.ph
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: vector.ph:
+; EE-SCALAR-NEXT:   CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, ir<0>
+; EE-SCALAR-NEXT:   vp<%3> = vector-pointer ir<%gep.src>
+; EE-SCALAR-NEXT:   WIDEN ir<%l> = load vp<%3>
+; EE-SCALAR-NEXT:   WIDEN ir<%c.1> = icmp eq ir<%l>, ir<10>
+; EE-SCALAR-NEXT:   EMIT vp<%4> = any-of ir<%c.1>
+; EE-SCALAR-NEXT:   EMIT branch-on-cond vp<%4>
+; EE-SCALAR-NEXT: Successor(s): scalar.ph, vector.ph.split
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: vector.ph.split:
+; EE-SCALAR-NEXT: Successor(s): vector loop
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: <x1> vector loop: {
+; EE-SCALAR-NEXT:   vector.body:
+; EE-SCALAR-NEXT:     EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; EE-SCALAR-NEXT:     vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; EE-SCALAR-NEXT:     EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; EE-SCALAR-NEXT:     CLONE ir<%gep.src>.1 = getelementptr inbounds ir<%src>, vp<%index.next>
+; EE-SCALAR-NEXT:     vp<%8> = vector-pointer ir<%gep.src>.1
+; EE-SCALAR-NEXT:     EMIT vp<%uncountable.exit.mask> = active lane mask vp<%index.next>, vp<%2>, ir<1>
+; EE-SCALAR-NEXT:     WIDEN ir<%l>.1 = load vp<%8>, vp<%uncountable.exit.mask>
+; EE-SCALAR-NEXT:     WIDEN ir<%c.1>.1 = icmp eq ir<%l>.1, ir<10>
+; EE-SCALAR-NEXT:     EMIT vp<%9> = any-of ir<%c.1>.1
+; EE-SCALAR-NEXT:     EMIT vp<%10> = icmp eq vp<%index.next>, vp<%2>
+; EE-SCALAR-NEXT:     EMIT vp<%11> = or vp<%9>, vp<%10>
+; EE-SCALAR-NEXT:     EMIT branch-on-cond vp<%11>
+; EE-SCALAR-NEXT:   No successors
+; EE-SCALAR-NEXT: }
+; EE-SCALAR-NEXT: Successor(s): middle.block
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: middle.block:
+; EE-SCALAR-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<128>, vp<%2>
+; EE-SCALAR-NEXT:   EMIT vp<%13> = icmp eq vp<%index.next>, ir<128>
+; EE-SCALAR-NEXT:   EMIT branch-on-cond vp<%13>
+; EE-SCALAR-NEXT: Successor(s): ir-bb<exit>, scalar.ph
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<exit>:
+; EE-SCALAR-NEXT:   IR   %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<1> from middle.block)
+; EE-SCALAR-NEXT: No successors
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: scalar.ph:
+; EE-SCALAR-NEXT:   EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%index.next>, middle.block ], [ ir<0>, ir-bb<entry> ], [ ir<0>, vector.ph ]
+; EE-SCALAR-NEXT: Successor(s): ir-bb<loop.header>
+; EE-SCALAR-EMPTY:
+; EE-SCALAR-NEXT: ir-bb<loop.header>:
+; EE-SCALAR-NEXT:   IR   %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; EE-SCALAR-NEXT:   IR   %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+; EE-SCALAR-NEXT:   IR   %l = load i32, ptr %gep.src, align 4
+; EE-SCALAR-NEXT:   IR   %c.1 = icmp eq i32 %l, 10
+; EE-SCALAR-NEXT: No successors
+; EE-SCALAR-NEXT: }
+
 entry:
   %src = alloca [128 x i32]
   call void @init(ptr %src)