[llvm] [LoopVectorize] Add support for vectorisation of simple early exit loops (PR #88385)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Wed May 1 01:34:31 PDT 2024


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/88385

>From 9347bf4312fdddfbbea1479c5be9870fd021edf8 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 11 Apr 2024 11:50:50 +0000
Subject: [PATCH 1/5] [LoopVectorize] Add support for vectorisation of simple
 early exit loops

This patch adds support for vectorisation of a simple class of loops
that typically involves searching for something, i.e.

  for (int i = 0; i < n; i++) {
    if (p[i] == val)
      return i;
  }
  return n;

or

  for (int i = 0; i < n; i++) {
    if (p1[i] != p2[i])
      return i;
  }
  return n;

In this initial commit we only vectorise loops with the following
criteria:

1. There are no stores in the loop.
2. The loop must have only one early exit like those shown in the
examples above. I have referred to such exits as speculative early
exits, to distinguish them from the existing support for early exits
where the exit-not-taken count is known exactly at compile time.
3. The early exit block dominates the latch block.
4. There are no loads after the early exit block.
5. The loop must not contain reductions or recurrences. I don't
see anything fundamental blocking vectorisation of such loops, but
I just haven't done the work to support them yet.
6. We must be able to prove at compile time that the loop will not
contain faulting loads.

For point 6, once this patch lands I intend to follow up by supporting
some limited cases of potentially faulting loops where we can version
the loop based on pointer alignment. For example, it turns out that in
the SPEC2017 benchmark suite there is a std::find loop that we can
vectorise provided we add SCEV checks for the initial pointer being
aligned to a multiple of the VF. In practice, the pointer is regularly
aligned to at least 32/64 bytes and, since the VF is a power of 2, any
vector load <= 32/64 bytes in size that faults will fault on the first
lane, following the same behaviour as the scalar loop. Given we
already do such speculative versioning for loops with unknown strides,
alignment-based versioning doesn't seem to be any worse.
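
As a rough illustration of the kind of loop point 6 is aimed at (this
example is invented for the explanation and is not taken from the patch
or its tests, and whether the current bounds analysis proves it depends
on what SCEV can establish), the loads below are provably non-faulting
because the only underlying object is a fixed-size local array and the
induction variable stays within its bounds:

  #include <stddef.h>

  /* All loads are from 'buf', a 1024-element local array, and the
     search loop only reads buf[0..n) with n clamped to 1024, so the
     accesses can never fault. */
  int first_match(int val, size_t n) {
    int buf[1024];
    if (n > 1024)
      n = 1024;
    for (size_t i = 0; i < n; i++)
      buf[i] = (int)(i * 7u);
    for (size_t i = 0; i < n; i++) {
      if (buf[i] == val)
        return (int)i;   /* speculative early exit */
    }
    return (int)n;
  }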

This patch makes use of the existing experimental.cttz.elts intrinsic,
which is required in the vectorised early exit block to determine the
first lane that triggered the exit. This intrinsic has generic lowering
support, so it is guaranteed to work for all targets.
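
The following scalar C sketch shows, purely for illustration, how the
vectorised control flow behaves: each vector iteration compares VF
elements at once, an or-reduction of the resulting mask decides whether
to take the early exit, and counting the trailing zero elements of the
mask (which is what experimental.cttz.elts does) recovers the first
matching lane. The names VF, cttz_elts and find_vectorised are invented
for this sketch and do not correspond to generated code:

  #include <stdbool.h>
  #include <stddef.h>

  #define VF 4   /* assumed fixed vector width, purely for the sketch */

  /* Emulates llvm.experimental.cttz.elts on a VF-wide i1 mask: counts
     the 'false' lanes before the first 'true' lane (VF if none set). */
  static size_t cttz_elts(const bool mask[VF]) {
    for (size_t lane = 0; lane < VF; lane++)
      if (mask[lane])
        return lane;
    return VF;
  }

  size_t find_vectorised(const int *p, size_t n, int val) {
    size_t i = 0;
    /* Main "vector" loop: runs while a full group of VF elements remains. */
    for (; i + VF <= n; i += VF) {
      bool mask[VF];
      bool any = false;
      for (size_t lane = 0; lane < VF; lane++) {  /* one vector compare */
        mask[lane] = (p[i + lane] == val);
        any |= mask[lane];
      }
      if (any)                        /* or-reduction of the exit mask */
        return i + cttz_elts(mask);   /* first lane that took the exit */
    }
    /* Scalar tail for any remaining iterations. */
    for (; i < n; i++)
      if (p[i] == val)
        return i;
    return n;
  }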

Tests have been added here:

  Transforms/LoopVectorize/AArch64/simple_early_exit.ll
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |   36 +
 llvm/include/llvm/Analysis/ScalarEvolution.h  |   36 +-
 llvm/include/llvm/IR/IRBuilder.h              |    7 +
 llvm/include/llvm/Support/GenericLoopInfo.h   |    4 +
 .../llvm/Support/GenericLoopInfoImpl.h        |   10 +
 .../Utils/ScalarEvolutionExpander.h           |    9 +-
 .../Vectorize/LoopVectorizationLegality.h     |   18 +
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  189 +-
 llvm/lib/Analysis/ScalarEvolution.cpp         |   94 +-
 .../Utils/ScalarEvolutionExpander.cpp         |    4 +-
 .../Vectorize/LoopVectorizationLegality.cpp   |   10 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |  384 ++-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   68 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   78 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   49 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |    4 +-
 .../AArch64/simple_early_exit.ll              | 2544 +++++++++++++++++
 .../Transforms/LoopVectorize/control-flow.ll  |    2 +-
 18 files changed, 3459 insertions(+), 87 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index e39c371b41ec5c..d79c53f490c927 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -587,6 +587,9 @@ class LoopAccessInfo {
   /// not legal to insert them.
   bool hasConvergentOp() const { return HasConvergentOp; }
 
+  /// Return true if the loop may fault due to memory accesses.
+  bool mayFault() const { return LoopMayFault; }
+
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return PtrRtChecking.get();
   }
@@ -608,6 +611,24 @@ class LoopAccessInfo {
   unsigned getNumStores() const { return NumStores; }
   unsigned getNumLoads() const { return NumLoads;}
 
+  /// Returns the block that exits early from the loop, if there is one.
+  /// Otherwise returns nullptr.
+  BasicBlock *getSpeculativeEarlyExitingBlock() const {
+    return SpeculativeEarlyExitingBB;
+  }
+
+  /// Returns the successor of the block that exits early from the loop, if
+  /// there is one. Otherwise returns nullptr.
+  BasicBlock *getSpeculativeEarlyExitBlock() const {
+    return SpeculativeEarlyExitBB;
+  }
+
+  /// Returns all blocks with a countable exit, i.e. those whose exit-not-taken
+  /// count is known exactly at compile time.
+  const SmallVector<BasicBlock *, 4> &getCountableEarlyExitingBlocks() const {
+    return CountableEarlyExitBlocks;
+  }
+
   /// The diagnostics report generated for the analysis.  E.g. why we
   /// couldn't analyze the loop.
   const OptimizationRemarkAnalysis *getReport() const { return Report.get(); }
@@ -659,6 +680,10 @@ class LoopAccessInfo {
   /// pass.
   bool canAnalyzeLoop();
 
+  /// Returns true if this is a supported early exit loop that we can analyze
+  /// in this pass.
+  bool isAnalyzableEarlyExitLoop();
+
   /// Save the analysis remark.
   ///
   /// LAA does not directly emits the remarks.  Instead it stores it which the
@@ -696,6 +721,17 @@ class LoopAccessInfo {
   /// Cache the result of analyzeLoop.
   bool CanVecMem = false;
   bool HasConvergentOp = false;
+  bool LoopMayFault = false;
+
+  /// Keeps track of the early-exiting block, if present.
+  BasicBlock *SpeculativeEarlyExitingBB = nullptr;
+
+  /// Keeps track of the successor of the early-exiting block, if present.
+  BasicBlock *SpeculativeEarlyExitBB = nullptr;
+
+  /// Keeps track of all the early exits with known or countable exit-not-taken
+  /// counts.
+  SmallVector<BasicBlock *, 4> CountableEarlyExitBlocks;
 
   /// Indicator that there are non vectorizable stores to a uniform address.
   bool HasDependenceInvolvingLoopInvariantAddress = false;
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 5828cc156cc785..562deab8b4159e 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -892,9 +892,13 @@ class ScalarEvolution {
   /// Similar to getBackedgeTakenCount, except it will add a set of
   /// SCEV predicates to Predicates that are required to be true in order for
   /// the answer to be correct. Predicates can be checked with run-time
-  /// checks and can be used to perform loop versioning.
-  const SCEV *getPredicatedBackedgeTakenCount(const Loop *L,
-                                              SmallVector<const SCEVPredicate *, 4> &Predicates);
+  /// checks and can be used to perform loop versioning. If \p Speculative is
+  /// true, this will attempt to return the speculative backedge count for loops
+  /// with early exits. However, this is only possible if we can formulate an
+  /// exact expression for the backedge count from the latch block.
+  const SCEV *getPredicatedBackedgeTakenCount(
+      const Loop *L, SmallVector<const SCEVPredicate *, 4> &Predicates,
+      bool Speculative = false);
 
   /// When successful, this returns a SCEVConstant that is greater than or equal
   /// to (i.e. a "conservative over-approximation") of the value returend by
@@ -912,6 +916,12 @@ class ScalarEvolution {
     return getBackedgeTakenCount(L, SymbolicMaximum);
   }
 
+  /// Return all the exiting blocks with exact exit counts.
+  void getExactExitingBlocks(const Loop *L,
+                             SmallVector<BasicBlock *, 4> *Blocks) {
+    getBackedgeTakenInfo(L).getExactExitingBlocks(L, this, Blocks);
+  }
+
   /// Return true if the backedge taken count is either the value returned by
   /// getConstantMaxBackedgeTakenCount or zero.
   bool isBackedgeTakenCountMaxOrZero(const Loop *L);
@@ -1534,6 +1544,16 @@ class ScalarEvolution {
     const SCEV *getExact(const Loop *L, ScalarEvolution *SE,
                          SmallVector<const SCEVPredicate *, 4> *Predicates = nullptr) const;
 
+    /// Similar to the above, except we permit unknown exit counts from
+    /// non-latch exit blocks. Any such early exit blocks must dominate the
+    /// latch and so the returned expression represents the speculative, or
+    /// maximum possible, *backedge-taken* count of the loop. If there is no
+    /// exact exit count for the latch this function returns
+    /// SCEVCouldNotCompute.
+    const SCEV *getSpeculative(
+        const Loop *L, ScalarEvolution *SE,
+        SmallVector<const SCEVPredicate *, 4> *Predicates = nullptr) const;
+
     /// Return the number of times this loop exit may fall through to the back
     /// edge, or SCEVCouldNotCompute. The loop is guaranteed not to exit via
     /// this block before this number of iterations, but may exit via another
@@ -1541,6 +1561,10 @@ class ScalarEvolution {
     const SCEV *getExact(const BasicBlock *ExitingBlock,
                          ScalarEvolution *SE) const;
 
+    /// Return all the exiting blocks with exact exit counts.
+    void getExactExitingBlocks(const Loop *L, ScalarEvolution *SE,
+                               SmallVector<BasicBlock *, 4> *Blocks) const;
+
     /// Get the constant max backedge taken count for the loop.
     const SCEV *getConstantMax(ScalarEvolution *SE) const;
 
@@ -2316,6 +2340,9 @@ class PredicatedScalarEvolution {
   /// Get the (predicated) backedge count for the analyzed loop.
   const SCEV *getBackedgeTakenCount();
 
+  /// Get the (predicated) speculative backedge count for the analyzed loop.
+  const SCEV *getSpeculativeBackedgeTakenCount();
+
   /// Adds a new predicate.
   void addPredicate(const SCEVPredicate &Pred);
 
@@ -2384,6 +2411,9 @@ class PredicatedScalarEvolution {
 
   /// The backedge taken count.
   const SCEV *BackedgeCount = nullptr;
+
+  /// The speculative backedge taken count.
+  const SCEV *SpeculativeBackedgeCount = nullptr;
 };
 
 template <> struct DenseMapInfo<ScalarEvolution::FoldID> {
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index b6534a1962a2f5..c7312dd2371fec 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2513,6 +2513,13 @@ class IRBuilderBase {
     return CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask, Name);
   }
 
+  Value *CreateCountTrailingZeroElems(Type *ResTy, Value *Mask,
+                                      const Twine &Name = "") {
+    return CreateIntrinsic(
+        Intrinsic::experimental_cttz_elts, {ResTy, Mask->getType()},
+        {Mask, getInt1(/*ZeroIsPoison=*/true)}, nullptr, Name);
+  }
+
   Value *CreateExtractValue(Value *Agg, ArrayRef<unsigned> Idxs,
                             const Twine &Name = "") {
     if (auto *V = Folder.FoldExtractValue(Agg, Idxs))
diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h
index d560ca648132c9..83cacf864089cc 100644
--- a/llvm/include/llvm/Support/GenericLoopInfo.h
+++ b/llvm/include/llvm/Support/GenericLoopInfo.h
@@ -294,6 +294,10 @@ template <class BlockT, class LoopT> class LoopBase {
   /// Otherwise return null.
   BlockT *getUniqueExitBlock() const;
 
+  /// Return the exit block for the latch if one exists. This function assumes
+  /// the loop has a latch.
+  BlockT *getLatchExitBlock() const;
+
   /// Return true if this loop does not have any exit blocks.
   bool hasNoExitBlocks() const;
 
diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
index 1e0d0ee446fc41..3beb3e538398ef 100644
--- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h
+++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h
@@ -159,6 +159,16 @@ BlockT *LoopBase<BlockT, LoopT>::getUniqueExitBlock() const {
   return getExitBlockHelper(this, true).first;
 }
 
+template <class BlockT, class LoopT>
+BlockT *LoopBase<BlockT, LoopT>::getLatchExitBlock() const {
+  BlockT *Latch = getLoopLatch();
+  assert(Latch && "Latch block must exists");
+  for (BlockT *Successor : children<BlockT *>(Latch))
+    if (!contains(Successor))
+      return Successor;
+  return nullptr;
+}
+
 /// getExitEdges - Return all pairs of (_inside_block_,_outside_block_).
 template <class BlockT, class LoopT>
 void LoopBase<BlockT, LoopT>::getExitEdges(
diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
index 62c1e15a9a60e1..05850f864d042a 100644
--- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
@@ -124,6 +124,11 @@ class SCEVExpander : public SCEVVisitor<SCEVExpander, Value *> {
   /// "expanded" form.
   bool LSRMode;
 
+  /// If the loop has an early exit we may have to use the speculative backedge
+  /// count, since the normal backedge count function is unable to compute a
+  /// SCEV expression.
+  bool UseSpeculativeBackedgeCount;
+
   typedef IRBuilder<InstSimplifyFolder, IRBuilderCallbackInserter> BuilderType;
   BuilderType Builder;
 
@@ -176,10 +181,12 @@ class SCEVExpander : public SCEVVisitor<SCEVExpander, Value *> {
 public:
   /// Construct a SCEVExpander in "canonical" mode.
   explicit SCEVExpander(ScalarEvolution &se, const DataLayout &DL,
-                        const char *name, bool PreserveLCSSA = true)
+                        const char *name, bool PreserveLCSSA = true,
+                        bool UseSpeculativeBackedgeCount = false)
       : SE(se), DL(DL), IVName(name), PreserveLCSSA(PreserveLCSSA),
         IVIncInsertLoop(nullptr), IVIncInsertPos(nullptr), CanonicalMode(true),
         LSRMode(false),
+        UseSpeculativeBackedgeCount(UseSpeculativeBackedgeCount),
         Builder(se.getContext(), InstSimplifyFolder(DL),
                 IRBuilderCallbackInserter(
                     [this](Instruction *I) { rememberInstruction(I); })) {
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a509ebf6a7e1b3..20a53abeb2e5cc 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -374,6 +374,24 @@ class LoopVectorizationLegality {
     return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
   }
 
+  /// Returns true if the loop has an early exit that requires using a
+  /// speculative backedge-taken count.
+  bool hasSpeculativeEarlyExit() const {
+    return LAI && LAI->getSpeculativeEarlyExitingBlock();
+  }
+
+  /// Returns the early exiting block in a loop with a speculative backedge
+  /// count.
+  BasicBlock *getSpeculativeEarlyExitingBlock() const {
+    return LAI->getSpeculativeEarlyExitingBlock();
+  }
+
+  /// Returns the destination of an early exiting block in a loop with a
+  /// speculative backedge count.
+  BasicBlock *getSpeculativeEarlyExitBlock() const {
+    return LAI->getSpeculativeEarlyExitBlock();
+  }
+
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
   bool isMaskRequired(const Instruction *I) const {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index f65515ca387229..a49a97402786ae 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -730,6 +730,9 @@ class AccessAnalysis {
     return UnderlyingObjects;
   }
 
+  /// Returns true if we cannot prove the loop will not fault.
+  bool mayFault();
+
 private:
   typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;
 
@@ -1281,6 +1284,63 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
   return CanDoRTIfNeeded;
 }
 
+bool AccessAnalysis::mayFault() {
+  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+  for (auto &UO : UnderlyingObjects) {
+    // TODO: For now if we encounter more than one underlying object we just
+    // assume it could fault. However, with more analysis it's possible to look
+    // at all of them and calculate a common range of permitted GEP indices.
+    if (UO.second.size() != 1)
+      return true;
+
+    // For now only the simplest cases are permitted, but this could be
+    // extended further.
+    auto *GEP = dyn_cast<GetElementPtrInst>(UO.first);
+    if (!GEP || GEP->getPointerOperand() != UO.second[0] ||
+        GEP->getNumIndices() != 1)
+      return true;
+
+    // Verify the pointer accessed within the loop always falls within the
+    // bounds of the underlying object, but first it's necessary to determine
+    // the object size.
+
+    auto GetKnownObjSize = [&](const Value *Obj) -> uint64_t {
+      // TODO: We should be able to support global variables too.
+      if (auto *AllocaObj = dyn_cast<AllocaInst>(Obj)) {
+        if (TheLoop->isLoopInvariant(AllocaObj))
+          if (std::optional<TypeSize> AllocaSize =
+                  AllocaObj->getAllocationSize(DL))
+            return !AllocaSize->isScalable() ? AllocaSize->getFixedValue() : 0;
+      } else if (auto *ArgObj = dyn_cast<Argument>(Obj))
+        return ArgObj->getDereferenceableBytes();
+      return 0;
+    };
+
+    uint64_t ObjSize = GetKnownObjSize(UO.second[0]);
+    if (!ObjSize)
+      return true;
+
+    Value *GEPInd = GEP->getOperand(1);
+    const SCEV *IndScev = PSE.getSCEV(GEPInd);
+    if (!isa<SCEVAddRecExpr>(IndScev))
+      return true;
+
+    // Calculate the maximum number of addressable elements in the object.
+    uint64_t ElemSize = GEP->getSourceElementType()->getScalarSizeInBits() / 8;
+    uint64_t MaxNumElems = ObjSize / ElemSize;
+
+    const SCEV *MinScev = PSE.getSE()->getConstant(GEPInd->getType(), 0);
+    const SCEV *MaxScev =
+        PSE.getSE()->getConstant(GEPInd->getType(), MaxNumElems);
+    if (!PSE.getSE()->isKnownOnEveryIteration(
+            ICmpInst::ICMP_SGE, cast<SCEVAddRecExpr>(IndScev), MinScev) ||
+        !PSE.getSE()->isKnownOnEveryIteration(
+            ICmpInst::ICMP_SLT, cast<SCEVAddRecExpr>(IndScev), MaxScev))
+      return true;
+  }
+  return false;
+}
+
 void AccessAnalysis::processMemAccesses() {
   // We process the set twice: first we process read-write pointers, last we
   // process read-only pointers. This allows us to skip dependence tests for
@@ -2346,6 +2406,73 @@ void MemoryDepChecker::Dependence::print(
   OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
 }
 
+bool LoopAccessInfo::isAnalyzableEarlyExitLoop() {
+  // At least one of the exiting blocks must be the latch.
+  BasicBlock *LatchBB = TheLoop->getLoopLatch();
+  if (!LatchBB)
+    return false;
+
+  SmallVector<BasicBlock *, 8> ExitingBlocks;
+  TheLoop->getExitingBlocks(ExitingBlocks);
+
+  // This is definitely not an early exit loop.
+  if (ExitingBlocks.size() < 2)
+    return false;
+
+  SmallVector<BasicBlock *, 4> ExactExitingBlocks;
+  PSE->getSE()->getExactExitingBlocks(TheLoop, &ExactExitingBlocks);
+
+  // We only support one speculative early exit.
+  if ((ExitingBlocks.size() - ExactExitingBlocks.size()) > 1)
+    return false;
+
+  // There could be multiple exiting blocks with an exact exit-not-taken
+  // count. Find the speculative early exit block, i.e. the one with an
+  // unknown count.
+  BasicBlock *TmpBB = nullptr;
+  for (BasicBlock *BB1 : ExitingBlocks) {
+    bool Found = false;
+    for (BasicBlock *BB2 : ExactExitingBlocks)
+      if (BB1 == BB2) {
+        Found = true;
+        break;
+      }
+    if (!Found) {
+      TmpBB = BB1;
+      break;
+    }
+  }
+  assert(TmpBB && "Expected to find speculative early exiting block");
+
+  // For now, let's keep things simple by ensuring the latch block only has
+  // the exiting block as a predecessor.
+  BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
+  if (!LatchPredBB || LatchPredBB != TmpBB)
+    return false;
+
+  LLVM_DEBUG(
+      dbgs()
+      << "LAA: Found an early exit. Retrying with speculative exit count.\n");
+  const SCEV *SpecExitCount = PSE->getSpeculativeBackedgeTakenCount();
+  if (isa<SCEVCouldNotCompute>(SpecExitCount))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "LAA: Found speculative backedge taken count: "
+                    << *SpecExitCount << '\n');
+  SpeculativeEarlyExitingBB = TmpBB;
+
+  for (BasicBlock *BB : successors(SpeculativeEarlyExitingBB))
+    if (BB != LatchBB) {
+      SpeculativeEarlyExitBB = BB;
+      break;
+    }
+  assert(SpeculativeEarlyExitBB &&
+         "Expected to find speculative early exit block");
+  CountableEarlyExitBlocks = std::move(ExactExitingBlocks);
+
+  return true;
+}
+
 bool LoopAccessInfo::canAnalyzeLoop() {
   // We need to have a loop header.
   LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
@@ -2371,10 +2498,12 @@ bool LoopAccessInfo::canAnalyzeLoop() {
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = PSE->getBackedgeTakenCount();
   if (isa<SCEVCouldNotCompute>(ExitCount)) {
-    recordAnalysis("CantComputeNumberOfIterations")
-        << "could not determine number of loop iterations";
     LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
-    return false;
+    if (!isAnalyzableEarlyExitLoop()) {
+      recordAnalysis("CantComputeNumberOfIterations")
+          << "could not determine number of loop iterations";
+      return false;
+    }
   }
 
   return true;
@@ -2406,6 +2535,9 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       EnableMemAccessVersioning &&
       !TheLoop->getHeader()->getParent()->hasOptSize();
 
+  BasicBlock *LatchBB = TheLoop->getLoopLatch();
+  bool HasComplexWorkInEarlyExitLoop = false;
+
   // Traverse blocks in fixed RPOT order, regardless of their storage in the
   // loop info, as it may be arbitrary.
   LoopBlocksRPO RPOT(TheLoop);
@@ -2421,7 +2553,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
 
       // With both a non-vectorizable memory instruction and a convergent
       // operation, found in this loop, no reason to continue the search.
-      if (HasComplexMemInst && HasConvergentOp) {
+      if ((HasComplexMemInst && HasConvergentOp) ||
+          HasComplexWorkInEarlyExitLoop) {
         CanVecMem = false;
         return;
       }
@@ -2439,6 +2572,14 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       // vectorize a loop if it contains known function calls that don't set
       // the flag. Therefore, it is safe to ignore this read from memory.
       auto *Call = dyn_cast<CallInst>(&I);
+      if (Call && SpeculativeEarlyExitingBB) {
+        recordAnalysis("CantVectorizeInstruction", Call)
+            << "cannot vectorize calls in early exit loop";
+        LLVM_DEBUG(dbgs() << "LAA: Found a call in early exit loop.\n");
+        HasComplexWorkInEarlyExitLoop = true;
+        continue;
+      }
+
       if (Call && getVectorIntrinsicIDForCall(Call, TLI))
         continue;
 
@@ -2466,6 +2607,13 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
           HasComplexMemInst = true;
           continue;
         }
+        if (SpeculativeEarlyExitingBB && BB == LatchBB) {
+          recordAnalysis("CantVectorizeInstruction", Call)
+              << "cannot vectorize loads after early exit block";
+          LLVM_DEBUG(dbgs() << "LAA: Found a load after early exit.\n");
+          HasComplexWorkInEarlyExitLoop = true;
+          continue;
+        }
         NumLoads++;
         Loads.push_back(Ld);
         DepChecker->addAccess(Ld);
@@ -2477,6 +2625,13 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       // Save 'store' instructions. Abort if other instructions write to memory.
       if (I.mayWriteToMemory()) {
         auto *St = dyn_cast<StoreInst>(&I);
+        if (SpeculativeEarlyExitingBB) {
+          recordAnalysis("CantVectorizeInstruction", St)
+              << "cannot vectorize stores in early exit loop";
+          LLVM_DEBUG(dbgs() << "LAA: Found a store in early exit loop.\n");
+          HasComplexWorkInEarlyExitLoop = true;
+          continue;
+        }
         if (!St) {
           recordAnalysis("CantVectorizeInstruction", St)
               << "instruction cannot be vectorized";
@@ -2499,7 +2654,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
     } // Next instr.
   } // Next block.
 
-  if (HasComplexMemInst) {
+  if (HasComplexMemInst || HasComplexWorkInEarlyExitLoop) {
     CanVecMem = false;
     return;
   }
@@ -2507,14 +2662,21 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
 
+  // If there is at least one memory access in the loop then it may fault. We
+  // will try to refine this later for certain early exit loops.
+  LoopMayFault = true;
+
   // Check if we see any stores. If there are no stores, then we don't
   // care if the pointers are *restrict*.
-  if (!Stores.size()) {
+  if (!Stores.size() && !SpeculativeEarlyExitingBB) {
     LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
     CanVecMem = true;
     return;
   }
 
+  assert(!Stores.size() || !SpeculativeEarlyExitingBB &&
+                               "Did not expect stores in an early exit loop!");
+
   MemoryDepChecker::DepCandidates DependentAccesses;
   AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE,
                           LoopAliasScopes);
@@ -2561,7 +2723,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
     }
   }
 
-  if (IsAnnotatedParallel) {
+  if (IsAnnotatedParallel && !SpeculativeEarlyExitingBB) {
     LLVM_DEBUG(
         dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
                << "checks.\n");
@@ -2611,7 +2773,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
 
   // If we write (or read-write) to a single destination and there are no
   // other reads in this loop then is it safe to vectorize.
-  if (NumReadWrites == 1 && NumReads == 0) {
+  if (!SpeculativeEarlyExitingBB && NumReadWrites == 1 && NumReads == 0) {
     LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
     CanVecMem = true;
     return;
@@ -2621,6 +2783,13 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
   // check.
   Accesses.buildDependenceSets();
 
+  if (SpeculativeEarlyExitingBB) {
+    assert(!Stores.size() && "Did not expect stores in an early exit loop!");
+    LoopMayFault = Accesses.mayFault();
+    CanVecMem = true;
+    return;
+  }
+
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   Value *UncomputablePtr = nullptr;
@@ -2977,7 +3146,9 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
   // of various possible stride specializations, considering the alternatives
   // of using gather/scatters (if available).
 
-  const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
+  const SCEV *BETakenCount = SpeculativeEarlyExitingBB
+                                 ? PSE->getSpeculativeBackedgeTakenCount()
+                                 : PSE->getBackedgeTakenCount();
 
   // Match the types so we can compare the stride and the BETakenCount.
   // The Stride can be positive/negative, so we sign extend Stride;
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 93f885c5d5ad8b..9bb4a3f7fca2c0 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -8276,10 +8276,13 @@ const SCEV *ScalarEvolution::getExitCount(const Loop *L,
   llvm_unreachable("Invalid ExitCountKind!");
 }
 
-const SCEV *
-ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L,
-                                                 SmallVector<const SCEVPredicate *, 4> &Preds) {
-  return getPredicatedBackedgeTakenInfo(L).getExact(L, this, &Preds);
+const SCEV *ScalarEvolution::getPredicatedBackedgeTakenCount(
+    const Loop *L, SmallVector<const SCEVPredicate *, 4> &Preds,
+    bool Speculative) {
+  if (Speculative)
+    return getPredicatedBackedgeTakenInfo(L).getSpeculative(L, this, &Preds);
+  else
+    return getPredicatedBackedgeTakenInfo(L).getExact(L, this, &Preds);
 }
 
 const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L,
@@ -8599,6 +8602,66 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE,
   return SE->getUMinFromMismatchedTypes(Ops, /* Sequential */ true);
 }
 
+void ScalarEvolution::BackedgeTakenInfo::getExactExitingBlocks(
+    const Loop *L, ScalarEvolution *SE,
+    SmallVector<BasicBlock *, 4> *Blocks) const {
+  // All exiting blocks we have collected must dominate the only backedge.
+  const BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch || !hasAnyInfo())
+    return;
+
+  for (const auto &ENT : ExitNotTaken) {
+    const SCEV *BECount = ENT.ExactNotTaken;
+    if (BECount == SE->getCouldNotCompute())
+      continue;
+    Blocks->push_back(ENT.ExitingBlock);
+  }
+
+  return;
+}
+
+const SCEV *ScalarEvolution::BackedgeTakenInfo::getSpeculative(
+    const Loop *L, ScalarEvolution *SE,
+    SmallVector<const SCEVPredicate *, 4> *Preds) const {
+  // All exiting blocks we have collected must dominate the only backedge.
+  const BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch)
+    return SE->getCouldNotCompute();
+
+  if (!hasAnyInfo())
+    return SE->getCouldNotCompute();
+
+  // All exiting blocks we have gathered dominate the loop's latch, so the
+  // speculative trip count is simply the minimum of these exit counts.
+  SmallVector<const SCEV *, 2> Ops;
+  bool FoundLatch = false;
+  for (const auto &ENT : ExitNotTaken) {
+    const SCEV *BECount = ENT.ExactNotTaken;
+    if (BECount == SE->getCouldNotCompute())
+      continue;
+
+    assert(SE->DT.dominates(ENT.ExitingBlock, Latch) &&
+           "We should only have known counts for exiting blocks that dominate "
+           "latch!");
+    Ops.push_back(BECount);
+    if (Preds)
+      for (const auto *P : ENT.Predicates)
+        Preds->push_back(P);
+    assert((Preds || ENT.hasAlwaysTruePredicate()) &&
+           "Predicate should be always true!");
+    if (ENT.ExitingBlock == Latch)
+      FoundLatch = true;
+  }
+
+  if (!FoundLatch)
+    return SE->getCouldNotCompute();
+
+  // If an earlier exit exits on the first iteration (exit count zero), then
+  // a later poison exit count should not propagate into the result. These are
+  // exactly the semantics provided by umin_seq.
+  return SE->getUMinFromMismatchedTypes(Ops, /* Sequential */ true);
+}
+
 /// Get the exact not taken count for this loop exit.
 const SCEV *
 ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock,
@@ -13564,8 +13627,15 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
     if (!isa<SCEVCouldNotCompute>(PBT)) {
       OS << "Predicated backedge-taken count is ";
       PrintSCEVWithTypeHint(OS, PBT);
-    } else
-      OS << "Unpredictable predicated backedge-taken count.";
+    } else {
+      SmallVector<const SCEVPredicate *, 4> SpecPreds;
+      PBT = SE->getPredicatedBackedgeTakenCount(L, SpecPreds);
+      if (!isa<SCEVCouldNotCompute>(PBT)) {
+        OS << "Speculative predicated backedge-taken count is ";
+        PrintSCEVWithTypeHint(OS, PBT);
+      } else
+        OS << "Unpredictable predicated backedge-taken count.";
+    }
     OS << "\n";
     OS << " Predicates:\n";
     for (const auto *P : Preds)
@@ -14783,6 +14853,18 @@ const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() {
   return BackedgeCount;
 }
 
+const SCEV *PredicatedScalarEvolution::getSpeculativeBackedgeTakenCount() {
+  if (!SpeculativeBackedgeCount) {
+    SmallVector<const SCEVPredicate *, 4> Preds;
+    SpeculativeBackedgeCount =
+        SE.getPredicatedBackedgeTakenCount(&L, Preds, true);
+    // TODO: Should we be adding these to a different set of predicates?
+    for (const auto *P : Preds)
+      addPredicate(*P);
+  }
+  return SpeculativeBackedgeCount;
+}
+
 void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
   if (Preds->implies(&Pred))
     return;
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 0feea0a4233cd1..5ba6a17207aa97 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -2078,8 +2078,8 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
 
   // FIXME: It is highly suspicious that we're ignoring the predicates here.
   SmallVector<const SCEVPredicate *, 4> Pred;
-  const SCEV *ExitCount =
-      SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred);
+  const SCEV *ExitCount = SE.getPredicatedBackedgeTakenCount(
+      AR->getLoop(), Pred, UseSpeculativeBackedgeCount);
 
   assert(!isa<SCEVCouldNotCompute>(ExitCount) && "Invalid loop count");
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index d33743e74cbe31..c85ec4324cfcaf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1498,6 +1498,16 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
       return false;
   }
 
+  if (getLAI() && getLAI()->getSpeculativeEarlyExitingBlock() &&
+      (getReductionVars().size() || getFixedOrderRecurrences().size())) {
+    reportVectorizationFailure(
+        "Found reductions or recurrences in early-exit loop",
+        "vectorizer cannot handle early exit loops with reductions or "
+        "recurrences",
+        "CFGNotUnderstood", ORE, TheLoop);
+    return false;
+  }
+
   LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                     << (LAI->getRuntimePointerChecking()->Need
                             ? " (with a runtime bound check)"
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c2..4cb5926503581b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -181,6 +181,12 @@ static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
     cl::desc("Enable vectorization of epilogue loops."));
 
+static cl::opt<bool> AssumeNoMemFault(
+    "vectorizer-no-mem-fault", cl::init(false), cl::Hidden,
+    cl::desc("Assume vectorized loops will not have memory faults, which is "
+             "potentially unsafe but can be useful for testing vectorization "
+             "of early exit loops."));
+
 static cl::opt<unsigned> EpilogueVectorizationForceVF(
     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
     cl::desc("When epilogue vectorization is enabled, and a value greater than "
@@ -603,8 +609,12 @@ class InnerLoopVectorizer {
   /// Set up the values of the IVs correctly when exiting the vector loop.
   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                     Value *VectorTripCount, Value *EndValue,
-                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
-                    VPlan &Plan, VPTransformState &State);
+                    BasicBlock *MiddleBlock, VPlan &Plan,
+                    VPTransformState &State);
+
+  void fixupEarlyExitIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+                             BasicBlock *VectorEarlyExitBB, VPlan &Plan,
+                             VPTransformState &State);
 
   /// Create the exit value of first order recurrences in the middle block and
   /// update their users.
@@ -979,8 +989,10 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
 }
 
 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
-                                Loop *OrigLoop) {
-  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+                                Loop *OrigLoop, bool IsSpeculative) {
+  const SCEV *BackedgeTakenCount = IsSpeculative
+                                       ? PSE.getSpeculativeBackedgeTakenCount()
+                                       : PSE.getBackedgeTakenCount();
   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
 
   ScalarEvolution &SE = *PSE.getSE();
@@ -1466,10 +1478,24 @@ class LoopVectorizationCostModel {
   bool requiresScalarEpilogue(bool IsVectorizing) const {
     if (!isScalarEpilogueAllowed())
       return false;
-    // If we might exit from anywhere but the latch, must run the exiting
-    // iteration in scalar form.
-    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
+    // For normal loops with countable early exits, if we exit from anywhere
+    // but the latch we require a scalar epilogue.
+    if (!Legal->hasSpeculativeEarlyExit() &&
+        TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
       return true;
+    // If this is a loop with a speculative early exit, then we may validly
+    // exit from a non-latch block and not require a scalar epilogue for the
+    // last iteration, since these exits are handled specially. However, since
+    // we could have both countable and speculative exits we must search all
+    // the exits.
+    if (Legal->hasSpeculativeEarlyExit()) {
+      const SmallVector<BasicBlock *, 4> &CountableExitingBlocks =
+          Legal->getLAI()->getCountableEarlyExitingBlocks();
+      if (CountableExitingBlocks.size() > 1 ||
+          (CountableExitingBlocks.size() == 1 &&
+           CountableExitingBlocks[0] != TheLoop->getLoopLatch()))
+        return true;
+    }
     return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
   }
 
@@ -1904,9 +1930,11 @@ class GeneratedRTChecks {
 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                     TargetTransformInfo *TTI, const DataLayout &DL,
-                    bool AddBranchWeights)
-      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
-        MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
+                    bool AddBranchWeights, bool LoopHasSpeculativeEarlyExit)
+      : DT(DT), LI(LI), TTI(TTI),
+        SCEVExp(SE, DL, "scev.check", true, LoopHasSpeculativeEarlyExit),
+        MemCheckExp(SE, DL, "scev.check", true, LoopHasSpeculativeEarlyExit),
+        AddBranchWeights(AddBranchWeights) {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -2968,7 +2996,12 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopScalarBody = OrigLoop->getHeader();
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
-  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
+
+  if (Cost->Legal->hasSpeculativeEarlyExit())
+    LoopExitBlock = OrigLoop->getLatchExitBlock();
+  else
+    LoopExitBlock = OrigLoop->getUniqueExitBlock();
+
   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
          "multiple exit loop without required epilogue?");
 
@@ -3000,11 +3033,12 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   // Update dominator for loop exit. During skeleton creation, only the vector
   // pre-header and the middle block are created. The vector loop is entirely
   // created during VPlan exection.
-  if (!Cost->requiresScalarEpilogue(VF.isVector()))
+  if (!Cost->requiresScalarEpilogue(VF.isVector())) {
     // If there is an epilogue which must run, there's no edge from the
     // middle block to exit blocks  and thus no need to update the immediate
     // dominator of the exit blocks.
     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+  }
 }
 
 PHINode *InnerLoopVectorizer::createInductionResumeValue(
@@ -3216,26 +3250,33 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                        const InductionDescriptor &II,
                                        Value *VectorTripCount, Value *EndValue,
-                                       BasicBlock *MiddleBlock,
-                                       BasicBlock *VectorHeader, VPlan &Plan,
+                                       BasicBlock *MiddleBlock, VPlan &Plan,
                                        VPTransformState &State) {
   // There are two kinds of external IV usages - those that use the value
   // computed in the last iteration (the PHI) and those that use the penultimate
   // value (the value that feeds into the phi from the loop latch).
   // We allow both, but they, obviously, have different values.
 
-  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
+  assert((OrigLoop->getUniqueExitBlock() || Legal->hasSpeculativeEarlyExit()) &&
+         "Expected a single exit block");
 
   DenseMap<Value *, Value *> MissingVals;
 
   // An external user of the last iteration's value should see the value that
   // the remainder loop uses to initialize its own IV.
-  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+  BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch();
+  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
   for (User *U : PostInc->users()) {
     Instruction *UI = cast<Instruction>(U);
     if (!OrigLoop->contains(UI)) {
-      assert(isa<PHINode>(UI) && "Expected LCSSA form");
-      MissingVals[UI] = EndValue;
+      PHINode *PHI = dyn_cast<PHINode>(UI);
+      assert(PHI && "Expected LCSSA form");
+      // Just because the user is outside the loop it doesn't mean the incoming
+      // value is always from the latch block. This could be an early exit loop
+      // with multiple paths to the same successor.
+      int Index = PHI->getBasicBlockIndex(OrigLoopLatch);
+      if (Index != -1 && PHI->getIncomingValue(Index) == PostInc)
+        MissingVals[PHI] = EndValue;
     }
   }
 
@@ -3245,7 +3286,12 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
   for (User *U : OrigPhi->users()) {
     auto *UI = cast<Instruction>(U);
     if (!OrigLoop->contains(UI)) {
-      assert(isa<PHINode>(UI) && "Expected LCSSA form");
+      PHINode *PHI = dyn_cast<PHINode>(UI);
+      assert(PHI && "Expected LCSSA form");
+      int Index = PHI->getBasicBlockIndex(OrigLoopLatch);
+      if (Index == -1 || PHI->getIncomingValue(Index) != OrigPhi)
+        continue;
+
       IRBuilder<> B(MiddleBlock->getTerminator());
 
       // Fast-math-flags propagate from the original induction instruction.
@@ -3282,6 +3328,94 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
   }
 }
 
+void InnerLoopVectorizer::fixupEarlyExitIVUsers(PHINode *OrigPhi,
+                                                const InductionDescriptor &II,
+                                                BasicBlock *VectorEarlyExitBB,
+                                                VPlan &Plan,
+                                                VPTransformState &State) {
+  // There are two kinds of external IV usages - those that use the value
+  // computed in the last iteration (the PHI) and those that use the penultimate
+  // value (the value that feeds into the phi from the loop latch).
+  // We allow both, but they, obviously, have different values.
+  DenseMap<Value *, Value *> MissingVals;
+  BasicBlock *OrigEarlyExitingBlock = Legal->getSpeculativeEarlyExitingBlock();
+  BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch();
+  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
+
+  auto FixUpPhi = [&](Instruction *UI, bool PostInc) -> Value * {
+    IRBuilder<> B(VectorEarlyExitBB->getTerminator());
+    assert(isa<PHINode>(UI) && "Expected LCSSA form");
+
+    // Fast-math-flags propagate from the original induction instruction.
+    if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
+
+    // We need to discover the mask that led us into the early exit block.
+    Value *EarlyExitMask = Plan.getVectorLoopRegion()->getEarlyExitMask(&State);
+    VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+    Type *CtzType = CanonicalIV->getStartValue()->getLiveInIRValue()->getType();
+    Value *Ctz;
+    if (EarlyExitMask)
+      Ctz = B.CreateCountTrailingZeroElems(CtzType, EarlyExitMask);
+    else
+      Ctz = ConstantInt::get(CtzType, 0);
+    Ctz = B.CreateAdd(Ctz,
+                      cast<PHINode>(State.get(CanonicalIV->getVPSingleValue(),
+                                              0, /*IsScalar=*/true)));
+    if (PostInc)
+      Ctz = B.CreateAdd(Ctz, ConstantInt::get(CtzType, 1));
+
+    Value *Escape = nullptr;
+    VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
+    assert(StepVPV && "step must have been expanded during VPlan execution");
+    Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
+                                      : State.get(StepVPV, {0, 0});
+    Escape = emitTransformedIndex(B, Ctz, II.getStartValue(), Step,
+                                  II.getKind(), II.getInductionBinOp());
+    Escape->setName("ind.early.escape");
+
+    return Escape;
+  };
+
+  for (User *U : PostInc->users()) {
+    // This assumes if it's not in the loop then it must be the normal
+    // exit block. However, it could be a user in an early exit block different
+    // to the latch's exit block.
+    auto *UI = cast<Instruction>(U);
+    if (!OrigLoop->contains(UI)) {
+      PHINode *PHI = dyn_cast<PHINode>(UI);
+      assert(PHI && "Expected LCSSA form");
+      int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock);
+      if (Index != -1 && PHI->getIncomingValue(Index) == PostInc)
+        MissingVals[UI] = FixUpPhi(UI, true);
+    }
+  }
+
+  for (User *U : OrigPhi->users()) {
+    auto *UI = cast<Instruction>(U);
+    if (!OrigLoop->contains(UI)) {
+      PHINode *PHI = dyn_cast<PHINode>(UI);
+      assert(PHI && "Expected LCSSA form");
+      int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock);
+      if (Index != -1 && PHI->getIncomingValue(Index) == OrigPhi)
+        MissingVals[UI] = FixUpPhi(UI, false);
+    }
+  }
+
+  for (auto &I : MissingVals) {
+    PHINode *PHI = cast<PHINode>(I.first);
+    // One corner case we have to handle is two IVs "chasing" each-other,
+    // that is %IV2 = phi [...], [ %IV1, %latch ]
+    // In this case, if IV1 has an external use, we need to avoid adding both
+    // "last value of IV1" and "penultimate value of IV2". So, verify that we
+    // don't already have an incoming value for the middle block.
+    if (PHI->getBasicBlockIndex(VectorEarlyExitBB) == -1) {
+      PHI->addIncoming(I.second, VectorEarlyExitBB);
+      Plan.removeEarlyExitLiveOut(PHI);
+    }
+  }
+}
+
 namespace {
 
 struct CSEDenseMapInfo {
@@ -3438,6 +3572,22 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
 
   VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
+
+  BasicBlock *VectorEarlyExitBB = nullptr;
+  if (VectorRegion->getEarlyExit()) {
+    // Fix-up external users of the induction variables.
+    VPBasicBlock *VectorEarlyExitVPBB =
+        cast<VPBasicBlock>(VectorRegion->getEarlyExit());
+    VectorEarlyExitBB = State.CFG.VPBB2IRBB[VectorEarlyExitVPBB];
+    for (const auto &Entry : Legal->getInductionVars())
+      fixupEarlyExitIVUsers(Entry.first, Entry.second, VectorEarlyExitBB, Plan,
+                            State);
+
+    BasicBlock *OrigEarlyExitBB = Legal->getSpeculativeEarlyExitBlock();
+    if (Loop *EEL = LI->getLoopFor(OrigEarlyExitBB))
+      EEL->addBasicBlockToLoop(VectorEarlyExitBB, *LI);
+  }
+
   if (Cost->requiresScalarEpilogue(VF.isVector())) {
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
@@ -3454,16 +3604,28 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
     for (const auto &Entry : Legal->getInductionVars())
       fixupIVUsers(Entry.first, Entry.second,
                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
-                   IVEndValues[Entry.first], LoopMiddleBlock,
-                   VectorLoop->getHeader(), Plan, State);
+                   IVEndValues[Entry.first], LoopMiddleBlock, Plan, State);
   }
 
   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
   // in the exit block, so update the builder.
+  BasicBlock *OrigEarlyExitBB = Legal->hasSpeculativeEarlyExit()
+                                    ? Legal->getSpeculativeEarlyExitBlock()
+                                    : nullptr;
+
   State.Builder.SetInsertPoint(State.CFG.ExitBB,
                                State.CFG.ExitBB->getFirstNonPHIIt());
   for (const auto &KV : Plan.getLiveOuts())
-    KV.second->fixPhi(Plan, State);
+    if (OrigLoop->getUniqueExitBlock() ||
+        KV.second->getPhi()->getParent() != OrigEarlyExitBB)
+      KV.second->fixPhi(Plan, State);
+
+  if (VectorEarlyExitBB) {
+    State.Builder.SetInsertPoint(VectorEarlyExitBB->getTerminator());
+    for (const auto &KV : Plan.getEarlyExitLiveOuts())
+      if (KV.second->getPhi()->getParent() == OrigEarlyExitBB)
+        KV.second->fixPhi(Plan, State);
+  }
 
   for (Instruction *PI : PredicatedInstructions)
     sinkScalarOperands(&*PI);
@@ -4622,7 +4784,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // a bottom-test and a single exiting block. We'd have to handle the fact
   // that not every instruction executes on the last iteration.  This will
   // require a lane mask which varies through the vector loop body.  (TODO)
-  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+  if (Legal->hasSpeculativeEarlyExit() ||
+      TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
     // If there was a tail-folding hint/switch, but we can't fold the tail by
     // masking, fallback to a vectorization with a scalar epilogue.
     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
@@ -5087,7 +5250,9 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
   // Epilogue vectorization code has not been auditted to ensure it handles
   // non-latch exits properly.  It may be fine, but it needs auditted and
   // tested.
-  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
+  // TODO: Add support for loops with an early exit.
+  if (Legal->hasSpeculativeEarlyExit() ||
+      OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
     return false;
 
   return true;
@@ -5194,7 +5359,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
       // TODO: extend to support scalable VFs.
       if (!RemainingIterations) {
-        const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
+        const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop,
+                                             Legal->hasSpeculativeEarlyExit());
         RemainingIterations = SE.getURemExpr(
             TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
       }
@@ -5321,6 +5487,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;
 
+  // We don't attempt to perform interleaving for early exit loops.
+  if (Legal->hasSpeculativeEarlyExit())
+    return 1;
+
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
   const bool HasReductions = !Legal->getReductionVars().empty();
 
@@ -6895,7 +7065,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
       // The back-edge branch will remain, as will all scalar branches.
       return TTI.getCFInstrCost(Instruction::Br, CostKind);
-    else
+    else if (I->getParent() == Legal->getSpeculativeEarlyExitingBlock()) {
+      // In order to determine whether we take an early exit or not we have to
+      // perform an or reduction of the vector predicate.
+      auto *Vec_i1Ty =
+          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+      InstructionCost EECost = TTI.getArithmeticReductionCost(
+          Instruction::Or, Vec_i1Ty, std::nullopt, CostKind);
+      // Add on the cost of the conditional branch, which will remain.
+      EECost += TTI.getCFInstrCost(Instruction::Br, CostKind);
+      // TODO: The vector loop early exit block also needs to do work to
+      // determine the first lane that triggered the exit. We should probably
+      // add that somehow, but the cost will be negligible for long loops.
+      return EECost;
+    } else
       // This branch will be eliminated by if-conversion.
       return 0;
     // Note: We currently assume zero cost for an unconditional branch inside
@@ -8589,13 +8772,19 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 
 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
 // original exit block.
-static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
-                                VPRecipeBuilder &Builder, VPlan &Plan) {
-  BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
-  BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
-  // Only handle single-exit loops with unique exit blocks for now.
-  if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
-    return;
+static void addUsersInExitBlock(Loop *OrigLoop, VPRecipeBuilder &Builder,
+                                VPlan &Plan) {
+  BasicBlock *ExitBB, *ExitingBB;
+
+  if (Plan.getVectorLoopRegion()->getEarlyExit()) {
+    ExitingBB = OrigLoop->getLoopLatch();
+    ExitBB = OrigLoop->getLatchExitBlock();
+  } else {
+    ExitBB = OrigLoop->getUniqueExitBlock();
+    ExitingBB = OrigLoop->getExitingBlock();
+    if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
+      return;
+  }
 
   // Introduce VPUsers modeling the exit values.
   for (PHINode &ExitPhi : ExitBB->phis()) {
@@ -8606,6 +8795,52 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
   }
 }
 
+// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
+// original exit block.
+static void addUsersInEarlyExitBlock(Loop *OrigLoop, BasicBlock *EarlyExitingBB,
+                                     BasicBlock *EarlyExitBB,
+                                     VPRecipeBuilder &Builder, VPlan &Plan) {
+  // Introduce VPUsers modeling the exit values.
+  for (PHINode &ExitPhi : EarlyExitBB->phis()) {
+    Value *IncomingValue = ExitPhi.getIncomingValueForBlock(EarlyExitingBB);
+    VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan);
+    Plan.addEarlyExitLiveOut(&ExitPhi, V);
+  }
+}
+
+static VPValue *getConditionForVectorEarlyExit(Loop *OrigLoop,
+                                               BasicBlock *ExitingBB,
+                                               VPlan &Plan, VPBuilder &Builder,
+                                               VPRecipeBuilder &RecipeBuilder,
+                                               VPRecipeBase *VPEarlyExitCond) {
+  // To make things easier we canonicalise the condition so that 'true'
+  // means take the early exit.
+  auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+
+  // If the true destination is in the loop then we want to invert the
+  // condition so that true means early exit.
+  bool NeedsInvert = OrigLoop->contains(BI->getSuccessor(0));
+
+  VPValue *ScalarExitCond;
+  if (!VPEarlyExitCond) {
+    // If we didn't find the exit condition, then this must have been
+    // defined outside the loop and is loop invariant.
+    ScalarExitCond =
+        RecipeBuilder.getVPValueOrAddLiveIn(BI->getCondition(), Plan);
+    if (NeedsInvert)
+      ScalarExitCond = Builder.createNot(ScalarExitCond);
+  } else {
+    VPValue *EarlyExitMask = VPEarlyExitCond->getVPSingleValue();
+    if (NeedsInvert)
+      EarlyExitMask = Builder.createNot(EarlyExitMask);
+    Plan.getVectorLoopRegion()->setVectorEarlyExitCond(EarlyExitMask);
+    // If any lane of EarlyExitMask is true, we should exit the loop.
+    ScalarExitCond =
+        Builder.createNaryOp(VPInstruction::OrReduction, {EarlyExitMask});
+  }
+  return ScalarExitCond;
+}
+
 VPlanPtr
 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
@@ -8622,7 +8857,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // the vector loop, followed by the middle basic block. The skeleton vector
   // loop region contains a header and latch basic blocks.
   VPlanPtr Plan = VPlan::createInitialVPlan(
-      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
+      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop,
+                          Legal->hasSpeculativeEarlyExit()),
       *PSE.getSE());
   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
@@ -8630,6 +8866,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
   Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
 
+  VPBasicBlock *EarlyExitVPBB = nullptr;
+  BasicBlock *EarlyExitingBB = nullptr;
+  if (Legal->hasSpeculativeEarlyExit()) {
+    EarlyExitingBB = Legal->getSpeculativeEarlyExitingBlock();
+    EarlyExitVPBB = new VPBasicBlock("vector.early.exit");
+    Plan->getVectorLoopRegion()->setEarlyExit(EarlyExitVPBB);
+    // Needed for updating the dominator tree.
+    Plan->getVectorLoopRegion()->setOrigEarlyExit(
+        Legal->getSpeculativeEarlyExitBlock());
+  }
+
   // Don't use getDecisionAndClampRange here, because we don't know the UF
   // so this function is better to be conservative, rather than to split
   // it up into different VPlans.
@@ -8690,6 +8937,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
         return Legal->blockNeedsPredication(BB) || NeedsBlends;
       });
+
+  // If we find the recipe for the early exit condition, we need to record it
+  // so that we can then generate the new vector exit condition.
+  VPRecipeBase *VPEarlyExitCond = nullptr;
+  Value *EarlyExitCond = nullptr;
+  if (EarlyExitingBB) {
+    BranchInst *BI = cast<BranchInst>(EarlyExitingBB->getTerminator());
+    EarlyExitCond = BI->getCondition();
+  }
+
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
     // Relevant instructions from basic block BB will be grouped into VPRecipe
     // ingredients and fill a new VPBasicBlock.
@@ -8728,6 +8985,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       if (!Recipe)
         Recipe = RecipeBuilder.handleReplication(Instr, Range);
 
+      if (&I == EarlyExitCond)
+        VPEarlyExitCond = Recipe;
+
       RecipeBuilder.setRecipe(Instr, Recipe);
       if (isa<VPHeaderPHIRecipe>(Recipe)) {
         // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
@@ -8746,19 +9006,47 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         VPBB->appendRecipe(Recipe);
     }
 
-    VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
-    VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+    // If this is an early exit block, we need to do more work to generate the
+    // actual exit condition. We generate an or-reduction of the vector
+    // condition so that we exit the loop if any lane of the vector would
+    // cause us to exit.
+    if (BB == EarlyExitingBB) {
+      VPValue *ScalarExitCond = getConditionForVectorEarlyExit(
+          OrigLoop, BB, *Plan, Builder, RecipeBuilder, VPEarlyExitCond);
+
+      // Branch to early exit BB.
+      auto *NewBR =
+          new VPInstruction(VPInstruction::BranchOnCond, {ScalarExitCond});
+      RecipeBuilder.setRecipe(cast<BranchInst>(BB->getTerminator()), NewBR);
+      VPBB->appendRecipe(NewBR);
+
+      VPBasicBlock *InLoopVPBB = new VPBasicBlock();
+
+      // Surely there should only be one successor?!
+      VPBlockBase *Successor = VPBB->getSingleSuccessor();
+      VPBlockUtils::disconnectBlocks(VPBB, Successor);
+      VPBlockUtils::insertTwoBlocksAfter(EarlyExitVPBB, InLoopVPBB, VPBB);
+      VPBlockUtils::connectBlocks(InLoopVPBB, Successor);
+
+      VPBB = InLoopVPBB;
+    } else {
+      VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
+      VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+    }
   }
 
   // After here, VPBB should not be used.
   VPBB = nullptr;
-
+  if (EarlyExitingBB)
+    addUsersInEarlyExitBlock(OrigLoop, EarlyExitingBB,
+                             Legal->getSpeculativeEarlyExitBlock(),
+                             RecipeBuilder, *Plan);
   if (CM.requiresScalarEpilogue(Range)) {
     // No edge from the middle block to the unique exit block has been inserted
     // and there is nothing to fix from vector loop; phis should have incoming
     // from scalar loop only.
   } else
-    addUsersInExitBlock(HeaderVPBB, OrigLoop, RecipeBuilder, *Plan);
+    addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan);
 
   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -8856,7 +9144,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 
   // Create new empty VPlan
   auto Plan = VPlan::createInitialVPlan(
-      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
+      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop,
+                          Legal->hasSpeculativeEarlyExit()),
       *PSE.getSE());
 
   // Build hierarchical CFG
@@ -9580,7 +9869,8 @@ static bool processLoopInVPlanNativePath(
     bool AddBranchWeights =
         hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
-                             F->getParent()->getDataLayout(), AddBranchWeights);
+                             F->getParent()->getDataLayout(), AddBranchWeights,
+                             false);
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9802,6 +10092,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
+  // We may not have LAI at this point.
+  if (LVL.hasSpeculativeEarlyExit() && LVL.getLAI()->mayFault()) {
+    if (!AssumeNoMemFault) {
+      LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot vectorize faulting "
+                        << "loop with early exit.\n");
+      return false;
+    }
+    LLVM_DEBUG(dbgs() << "LV: Assuming early exit vector loop will not "
+                      << "fault\n");
+  }
+
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
@@ -9928,7 +10229,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool AddBranchWeights =
       hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
-                           F->getParent()->getDataLayout(), AddBranchWeights);
+                           F->getParent()->getDataLayout(), AddBranchWeights,
+                           LVL.hasSpeculativeEarlyExit());
   if (MaybeVF) {
     VF = *MaybeVF;
     // Select the interleave count.
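
For illustration only (not part of the patch): assuming a fixed-width
<4 x i8> loop like those in the new tests, the exit condition built by
getConditionForVectorEarlyExit reduces to something like the IR below. The
compare is inverted so that 'true' means "take the early exit" and then
or-reduced, so the branch is taken if any lane would exit. Value and block
names here are purely illustrative.

  %cmp = icmp eq <4 x i8> %ld1, %ld2
  %mask = xor <4 x i1> %cmp, <i1 true, i1 true, i1 true, i1 true>
  %any.exit = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %mask)
  br i1 %any.exit, label %vector.early.exit, label %vector.latch

The scalable-vector form (@llvm.vector.reduce.or.nxv16i1) can be seen in the
CHECK lines of simple_early_exit.ll further down.
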
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 999236ae84898b..f431fb1cb86710 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -489,8 +489,10 @@ void VPBasicBlock::execute(VPTransformState *State) {
     UnreachableInst *Terminator = State->Builder.CreateUnreachable();
     // Register NewBB in its loop. In innermost loops its the same for all
     // BB's.
-    if (State->CurrentVectorLoop)
+    if (State->CurrentVectorLoop &&
+        this != getPlan()->getVectorLoopRegion()->getEarlyExit()) {
       State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
+    }
     State->Builder.SetInsertPoint(Terminator);
     State->CFG.PrevBB = NewBB;
   }
@@ -750,6 +752,10 @@ VPlan::~VPlan() {
     delete KV.second;
   LiveOuts.clear();
 
+  for (auto &KV : EarlyExitLiveOuts)
+    delete KV.second;
+  EarlyExitLiveOuts.clear();
+
   if (Entry) {
     VPValue DummyValue;
     for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -891,12 +897,29 @@ void VPlan::execute(VPTransformState *State) {
     }
   }
 
+  BasicBlock *VectorEarlyExitBB = nullptr;
+  BasicBlock *VectorEarlyExitingBB = nullptr;
+  BasicBlock *OrigEarlyExitBB = nullptr;
+  if (getVectorLoopRegion()->getEarlyExit()) {
+    VPBasicBlock *EarlyExitVPBB =
+        cast<VPBasicBlock>(getVectorLoopRegion()->getEarlyExit());
+    VPBasicBlock *EarlyExitingVPBB =
+        cast<VPBasicBlock>(EarlyExitVPBB->getSinglePredecessor());
+    VectorEarlyExitBB = State->CFG.VPBB2IRBB[EarlyExitVPBB];
+    VectorEarlyExitingBB = State->CFG.VPBB2IRBB[EarlyExitingVPBB];
+    OrigEarlyExitBB = getVectorLoopRegion()->getOrigEarlyExit();
+    BranchInst *BI = BranchInst::Create(OrigEarlyExitBB);
+    BI->insertBefore(VectorEarlyExitBB->getTerminator());
+    VectorEarlyExitBB->getTerminator()->eraseFromParent();
+  }
+
   // We do not attempt to preserve DT for outer loop vectorization currently.
   if (!EnableVPlanNativePath) {
     BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
     State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
     updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
-                        State->CFG.ExitBB);
+                        State->CFG.ExitBB, VectorEarlyExitingBB,
+                        VectorEarlyExitBB, OrigEarlyExitBB);
   }
 }
 
@@ -954,6 +977,12 @@ void VPlan::print(raw_ostream &O) const {
     KV.second->print(O, SlotTracker);
   }
 
+  if (!EarlyExitLiveOuts.empty())
+    O << "\n";
+  for (const auto &KV : EarlyExitLiveOuts) {
+    KV.second->print(O, SlotTracker);
+  }
+
   O << "}\n";
 }
 
@@ -995,9 +1024,17 @@ void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
   LiveOuts.insert({PN, new VPLiveOut(PN, V)});
 }
 
+void VPlan::addEarlyExitLiveOut(PHINode *PN, VPValue *V) {
+  assert(EarlyExitLiveOuts.count(PN) == 0 &&
+         "an exit value for PN already exists");
+  EarlyExitLiveOuts.insert({PN, new VPLiveOut(PN, V, true)});
+}
+
 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
-                                BasicBlock *LoopLatchBB,
-                                BasicBlock *LoopExitBB) {
+                                BasicBlock *LoopLatchBB, BasicBlock *LoopExitBB,
+                                BasicBlock *VectorLoopEarlyExitingBB,
+                                BasicBlock *VectorLoopEarlyExitBB,
+                                BasicBlock *OrigEarlyExitBB) {
   // The vector body may be more than a single basic-block by this point.
   // Update the dominator tree information inside the vector body by propagating
   // it from header to latch, expecting only triangular control-flow, if any.
@@ -1007,14 +1044,31 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
     std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
     assert(Succs.size() <= 2 &&
            "Basic block in vector loop has more than 2 successors.");
-    PostDomSucc = Succs[0];
     if (Succs.size() == 1) {
+      PostDomSucc = Succs[0];
       assert(PostDomSucc->getSinglePredecessor() &&
              "PostDom successor has more than one predecessor.");
       DT->addNewBlock(PostDomSucc, BB);
       continue;
     }
+
+    // Deal with early exits.
+    if (BB == VectorLoopEarlyExitingBB) {
+      BasicBlock *InLoopBB =
+          Succs[0] == VectorLoopEarlyExitBB ? Succs[1] : Succs[0];
+      assert(InLoopBB->getSinglePredecessor() &&
+             "Interim successor has more than one predecessor.");
+      assert(VectorLoopEarlyExitBB->getSinglePredecessor() &&
+             "Interim successor has more than one predecessor.");
+      DT->addNewBlock(VectorLoopEarlyExitBB, BB);
+      DT->addNewBlock(InLoopBB, BB);
+      DT->insertEdge(VectorLoopEarlyExitBB, OrigEarlyExitBB);
+      PostDomSucc = InLoopBB;
+      continue;
+    }
+
     BasicBlock *InterimSucc = Succs[1];
+    PostDomSucc = Succs[0];
     if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
       PostDomSucc = Succs[1];
       InterimSucc = Succs[0];
@@ -1103,6 +1157,10 @@ VPlan *VPlan::duplicate() {
   for (const auto &[_, LO] : LiveOuts)
     NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
 
+  for (const auto &[_, LO] : EarlyExitLiveOuts)
+    NewPlan->addEarlyExitLiveOut(LO->getPhi(),
+                                 Old2NewVPValues[LO->getOperand(0)]);
+
   // Initialize remaining fields of cloned VPlan.
   NewPlan->VFs = VFs;
   NewPlan->UFs = UFs;
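
To show how the dominator tree update above hangs together, this is a rough
sketch of the CFG shape produced for an early exit loop (block names are
illustrative, roughly matching the test output below). Both successors of the
exiting block become dominator-tree children of it, and an extra edge is
inserted from the vector early exit block to the original early exit block:

  vector.body:        br i1 %any.exit, label %vector.early.exit, label %vector.latch
  vector.latch:       br i1 %done, label %middle.block, label %vector.body
  vector.early.exit:  br label %loop.end
  middle.block:       br i1 %cmp.n, label %loop.end, label %scalar.ph
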
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4ac..586c428cb8a082 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -79,7 +79,8 @@ Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                        int64_t Step);
 
 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
-                                Loop *CurLoop = nullptr);
+                                Loop *CurLoop = nullptr,
+                                bool IsSpeculative = false);
 
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
@@ -669,9 +670,14 @@ class VPBlockBase {
 class VPLiveOut : public VPUser {
   PHINode *Phi;
 
+  /// Is this a live-out value specifically for an early exit from the vector
+  /// loop? If so, it needs special handling.
+  bool EarlyExit;
+
 public:
-  VPLiveOut(PHINode *Phi, VPValue *Op)
-      : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}
+  VPLiveOut(PHINode *Phi, VPValue *Op, bool EarlyExit = false)
+      : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi),
+        EarlyExit(EarlyExit) {}
 
   static inline bool classof(const VPUser *U) {
     return U->getVPUserID() == VPUser::VPUserID::LiveOut;
@@ -1180,6 +1186,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
     // operand). Only generates scalar values (either for the first lane only or
     // for all lanes, depending on its uses).
     PtrAdd,
+    OrReduction,
   };
 
 private:
@@ -2938,7 +2945,9 @@ class VPBasicBlock : public VPBlockBase {
 };
 
 /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
-/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
+/// which form a Single-Entry-Single-Exiting or Single-Entry-Multiple-Exiting
+/// subgraph of the output IR CFG. For the multiple-exiting case we currently
+/// support only two exits in total, and the early exit is tracked separately.
 /// A VPRegionBlock may indicate that its contents are to be replicated several
 /// times. This is designed to support predicated scalarization, in which a
 /// scalar if-then code structure needs to be generated VF * UF times. Having
@@ -2953,6 +2962,18 @@ class VPRegionBlock : public VPBlockBase {
   /// VPRegionBlock.
   VPBlockBase *Exiting;
 
+  /// Holds the early exit block of the SEME region, if one exists.
+  VPBlockBase *EarlyExit;
+
+  /// We need to keep track of the early exit block from the original scalar
+  /// loop in order to update the dominator tree correctly, since the vector
+  /// early exit will also jump to the original block.
+  BasicBlock *OrigEarlyExit;
+
+  /// If one exists, this keeps track of the vector mask that triggered the
+  /// early exit.
+  VPValue *VectorEarlyExitCond;
+
   /// An indicator whether this region is to generate multiple replicated
   /// instances of output IR corresponding to its VPBlockBases.
   bool IsReplicator;
@@ -2961,7 +2982,8 @@ class VPRegionBlock : public VPBlockBase {
   VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
                 const std::string &Name = "", bool IsReplicator = false)
       : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
-        IsReplicator(IsReplicator) {
+        EarlyExit(nullptr), OrigEarlyExit(nullptr),
+        VectorEarlyExitCond(nullptr), IsReplicator(IsReplicator) {
     assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
     assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
     Entry->setParent(this);
@@ -2969,7 +2991,8 @@ class VPRegionBlock : public VPBlockBase {
   }
   VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
-        IsReplicator(IsReplicator) {}
+        EarlyExit(nullptr), OrigEarlyExit(nullptr),
+        VectorEarlyExitCond(nullptr), IsReplicator(IsReplicator) {}
 
   ~VPRegionBlock() override {
     if (Entry) {
@@ -2987,6 +3010,14 @@ class VPRegionBlock : public VPBlockBase {
   const VPBlockBase *getEntry() const { return Entry; }
   VPBlockBase *getEntry() { return Entry; }
 
+  /// Returns the early exit vector mask, if one exists.
+  Value *getEarlyExitMask(VPTransformState *State) {
+    return VectorEarlyExitCond ? State->get(VectorEarlyExitCond, 0) : nullptr;
+  }
+
+  /// Sets the early exit vector mask.
+  void setVectorEarlyExitCond(VPValue *V) { VectorEarlyExitCond = V; }
+
   /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
   /// EntryBlock must have no predecessors.
   void setEntry(VPBlockBase *EntryBlock) {
@@ -3008,6 +3039,22 @@ class VPRegionBlock : public VPBlockBase {
     ExitingBlock->setParent(this);
   }
 
+  void setEarlyExit(VPBlockBase *ExitBlock) {
+    assert(ExitBlock->getSuccessors().empty() &&
+           "Exit block cannot have successors.");
+    EarlyExit = ExitBlock;
+    ExitBlock->setParent(this);
+  }
+
+  const VPBlockBase *getEarlyExit() const { return EarlyExit; }
+  VPBlockBase *getEarlyExit() { return EarlyExit; }
+
+  void setOrigEarlyExit(BasicBlock *EarlyExitBlock) {
+    OrigEarlyExit = EarlyExitBlock;
+  }
+
+  BasicBlock *getOrigEarlyExit() { return OrigEarlyExit; }
+
   /// Returns the pre-header VPBasicBlock of the loop region.
   VPBasicBlock *getPreheaderVPBB() {
     assert(!isReplicator() && "should only get pre-header of loop regions");
@@ -3094,6 +3141,9 @@ class VPlan {
   /// Values used outside the plan.
   MapVector<PHINode *, VPLiveOut *> LiveOuts;
 
+  /// Values used outside the plan, live out of the early exit block.
+  MapVector<PHINode *, VPLiveOut *> EarlyExitLiveOuts;
+
   /// Mapping from SCEVs to the VPValues representing their expansions.
   /// NOTE: This mapping is temporary and will be removed once all users have
   /// been modeled in VPlan directly.
@@ -3258,6 +3308,17 @@ class VPlan {
     return LiveOuts;
   }
 
+  void addEarlyExitLiveOut(PHINode *PN, VPValue *V);
+
+  void removeEarlyExitLiveOut(PHINode *PN) {
+    delete EarlyExitLiveOuts[PN];
+    EarlyExitLiveOuts.erase(PN);
+  }
+
+  const MapVector<PHINode *, VPLiveOut *> &getEarlyExitLiveOuts() const {
+    return EarlyExitLiveOuts;
+  }
+
   VPValue *getSCEVExpansion(const SCEV *S) const {
     return SCEVToExpansion.lookup(S);
   }
@@ -3280,7 +3341,10 @@ class VPlan {
   /// that was created between it and the latch block, inclusive.
   static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
                                   BasicBlock *LoopPreHeaderBB,
-                                  BasicBlock *LoopExitBB);
+                                  BasicBlock *LoopExitBB,
+                                  BasicBlock *VectorLoopEarlyExitingBB,
+                                  BasicBlock *VectorLoopEarlyExitBB,
+                                  BasicBlock *OrigEarlyExitBB);
 };
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c82..516d2b15914068 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -179,17 +179,33 @@ bool VPRecipeBase::mayHaveSideEffects() const {
 }
 
 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
-  auto Lane = VPLane::getLastLaneForVF(State.VF);
   VPValue *ExitValue = getOperand(0);
-  if (vputils::isUniformAfterVectorization(ExitValue))
-    Lane = VPLane::getFirstLane();
-  VPBasicBlock *MiddleVPBB =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
-  assert(MiddleVPBB->getNumSuccessors() == 0 &&
-         "the middle block must not have any successors");
-  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
-  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
-                   MiddleBB);
+  VPLane Lane = VPLane::getFirstLane();
+  if (!vputils::isUniformAfterVectorization(ExitValue))
+    Lane = VPLane::getLastLaneForVF(State.VF);
+
+  VPBasicBlock *ExitVPBB;
+  if (EarlyExit)
+    ExitVPBB = cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEarlyExit());
+  else
+    ExitVPBB =
+        cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
+  assert(ExitVPBB->getNumSuccessors() == 0 &&
+         "the middle or early exit block must not have any successors");
+
+  Value *NewIncoming = nullptr;
+  if (!Lane.isFirstLane() && EarlyExit) {
+    assert(State.UF == 1 && "Early exits unsupported for unrolled loops");
+    NewIncoming = State.get(ExitValue, 0);
+    Value *EarlyExitMask = Plan.getVectorLoopRegion()->getEarlyExitMask(&State);
+    Value *Ctz = State.Builder.CreateCountTrailingZeroElems(
+        State.Builder.getInt64Ty(), EarlyExitMask);
+    NewIncoming = State.Builder.CreateExtractElement(NewIncoming, Ctz);
+  } else
+    NewIncoming = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
+
+  BasicBlock *ExitBB = State.CFG.VPBB2IRBB[ExitVPBB];
+  Phi->addIncoming(NewIncoming, ExitBB);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -282,6 +298,8 @@ bool VPInstruction::doesGeneratePerAllLanes() const {
   return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
 }
 
+// TODO: Can this function be made static given it's only ever called from one
+// place in this file?
 bool VPInstruction::canGenerateScalarForFirstLane() const {
   if (Instruction::isBinaryOp(getOpcode()))
     return true;
@@ -292,6 +310,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ComputeReductionResult:
+  case VPInstruction::OrReduction:
   case VPInstruction::PtrAdd:
   case VPInstruction::ExplicitVectorLength:
     return true;
@@ -558,6 +577,10 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
     Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
     return Builder.CreatePtrAdd(Ptr, Addend, Name);
   }
+  case VPInstruction::OrReduction: {
+    Value *Val = State.get(getOperand(0), Part);
+    return Builder.CreateOrReduce(Val);
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -586,7 +609,8 @@ void VPInstruction::execute(VPTransformState &State) {
   bool GeneratesPerFirstLaneOnly =
       canGenerateScalarForFirstLane() &&
       (vputils::onlyFirstLaneUsed(this) ||
-       getOpcode() == VPInstruction::ComputeReductionResult);
+       getOpcode() == VPInstruction::ComputeReductionResult ||
+       getOpcode() == VPInstruction::OrReduction);
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     if (GeneratesPerAllLanes) {
@@ -686,6 +710,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::PtrAdd:
     O << "ptradd";
     break;
+  case VPInstruction::OrReduction:
+    O << "or reduction";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe634..fa147d390f01ec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -681,7 +681,9 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
 
   Type *IdxTy =
       Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
-  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
+  // FIXME: Need to know if trip count is speculative or not.
+  const SCEV *TripCount = createTripCountSCEV(
+      IdxTy, PSE, nullptr, Plan.getVectorLoopRegion()->getEarlyExit());
   ScalarEvolution &SE = *PSE.getSE();
   ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
   const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
new file mode 100644
index 00000000000000..ded9ec6153b82d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -0,0 +1,2544 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S < %s -p loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,MAY_FAULT
+; RUN: opt -S < %s -p loop-vectorize -vectorizer-no-mem-fault -mattr=+sve -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,NO_FAULT
+
+
+define i64 @same_exit_block_pre_inc_use1() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_alloca_diff_type(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [40 x i32], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [40 x i32], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [40 x i32]
+  %p2 = alloca [40 x i32]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p1, ptr dereferenceable(1024) %p2) {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs(
+; CHECK-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP2]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; CHECK-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use2() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 43, [[LAND_RHS]] ], [ [[INDEX]], [[FOR_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ], [ 43, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 43, %loop ], [ %index, %loop.inc ]
+  ret i64 %retval
+}
+
+define i64 @same_exit_block_pre_inc_use3() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use3(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[TMP38]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE5:%.*]] = add i64 3, [[TMP39]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    [[CMO6:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE7:%.*]] = add i64 3, [[CMO6]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[FOR_INC]] ], [ [[INDEX]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE5]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_ESCAPE7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[INDEX_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  ret i64 %index
+}
+
+
+; In this example the early exit appears in the list of ExitNotTaken exits,
+; but its exit count is not computable.
+define i64 @same_exit_block_pre_inc_use4() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use4(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i64], align 8
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i64], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 2 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 2 x i64> [[TMP9]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 1, [[TMP18]]
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP19]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC3]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = mul <vscale x 2 x i64> [[TMP15]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT5]], [[TMP16]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ult <vscale x 2 x i64> [[VEC_IND]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP23:%.*]] = xor <vscale x 2 x i1> [[TMP22]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP23]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc4:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT3]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP23]], i1 true)
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP26]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[TMP28]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i64]
+  %p2 = alloca [1024 x i64]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i64, ptr %p1, i64 %index
+  %ld1 = load i64, ptr %arrayidx, align 1
+  %cmp3 = icmp ult i64 %index, %ld1
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
+
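+; Both exits branch to the same block. The latch exit uses the post-incremented
+; induction variable, while the early exit uses the pre-incremented value.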
+define i64 @same_exit_block_post_inc_use() {
+; CHECK-LABEL: define i64 @same_exit_block_post_inc_use(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[TMP12]], 1
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP38]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ [[INDEX_NEXT]], [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ %index.next, %loop.inc ]
+  ret i64 %retval
+}
+
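+; Both exits branch to the same block. The early exit uses the post-incremented
+; induction variable, while the latch exit uses the pre-incremented value.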
+define i64 @same_exit_block_post_inc_use2() {
+; CHECK-LABEL: define i64 @same_exit_block_post_inc_use2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = add i64 [[TMP12]], 1
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP34:%.*]] = xor <vscale x 16 x i1> [[TMP33]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP34]])
+; CHECK-NEXT:    br i1 [[TMP35]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP34]], i1 true)
+; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
+; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[TMP38]], 1
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP39]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LAND_RHS]] ], [ [[INDEX]], [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %index.next = add i64 %index, 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index.next, %loop ], [ %index, %loop.inc ]
+  ret i64 %retval
+}
+
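+; Both exits branch to the same block and the exit value is a phi of constants,
+; so no induction escape value is required in the vector early exit block.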
+define i64 @same_exit_block_phi_of_consts() {
+; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 16 x i1> [[TMP13]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP14]])
+; CHECK-NEXT:    br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 0, [[LAND_RHS]] ], [ 1, [[FOR_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ]
+  ret i64 %retval
+}
+
+
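+; The early exit and the latch exit branch to different blocks. The early exit
+; returns the pre-incremented induction variable.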
+define i64 @diff_exit_block_pre_inc_use1() {
+; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[FOR_EARLY_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_EARLY_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_1_LCSSA:%.*]] = phi i64 [ 43, [[FOR_INC]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_1_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ %index, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ 43, %loop.inc ]
+  ret i64 %retval2
+}
+
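+; The early exit and the latch exit branch to different blocks. The early exit
+; returns a constant and the latch exit returns the pre-incremented induction
+; variable.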
+define i64 @diff_exit_block_pre_inc_use2() {
+; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[FOR_EARLY_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_EARLY_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 43, [[LAND_RHS]] ], [ 43, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_1_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[FOR_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_1_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ 43, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ %index, %loop.inc ]
+  ret i64 %retval2
+}
+
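+; The early exit and the latch exit branch to different blocks, but both return
+; the pre-incremented induction variable.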
+define i64 @diff_exit_block_pre_inc_use3() {
+; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use3(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT5:%.*]], [[FOR_INC4:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC4]]
+; CHECK:       loop.inc4:
+; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[INDEX2]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX2]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[FOR_EARLY_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_EARLY_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[INDEX_LCSSA]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[FOR_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[INDEX_LCSSA1]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  ret i64 %index
+
+loop.end:
+  ret i64 %index
+}
+
+
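+; The early exit and the latch exit branch to different blocks and both return
+; constants, so no induction escape values are required.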
+define i64 @diff_exit_block_phi_of_consts() {
+; CHECK-LABEL: define i64 @diff_exit_block_phi_of_consts(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 16 x i1> [[TMP13]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP14]])
+; CHECK-NEXT:    br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    br label [[FOR_EARLY_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_EARLY_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    ret i64 0
+; CHECK:       loop.end:
+; CHECK-NEXT:    ret i64 1
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  ret i64 0
+
+loop.end:
+  ret i64 1
+}
+
+
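+; The early exit and the latch exit branch to different blocks. The latch exit
+; returns the post-incremented induction variable, while the early exit returns
+; the pre-incremented value.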
+define i64 @diff_exit_block_post_inc_use1() {
+; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP31]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP33]])
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[TMP12]], 1
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
+; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP38]]
+; CHECK-NEXT:    br label [[FOR_EARLY_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_EARLY_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_1_LCSSA:%.*]] = phi i64 [ [[INDEX_NEXT]], [[FOR_INC]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_1_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ %index, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ %index.next, %loop.inc ]
+  ret i64 %retval2
+}
+
+
+define i64 @diff_exit_block_post_inc_use2() {
+; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[TMP12]], 1
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP31]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP32]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP34:%.*]] = xor <vscale x 16 x i1> [[TMP33]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP34]])
+; CHECK-NEXT:    br i1 [[TMP35]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP34]], i1 true)
+; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
+; CHECK-NEXT:    [[TMP39:%.*]] = add i64 [[TMP38]], 1
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP39]]
+; CHECK-NEXT:    br label [[FOR_EARLY_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 3, [[CMO]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_EARLY_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK:       loop.early.exit:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_1_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[FOR_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_1_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %index.next = add i64 %index, 1
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.early.exit
+
+loop.inc:
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.early.exit:
+  %retval1 = phi i64 [ %index.next, %loop ]
+  ret i64 %retval1
+
+loop.end:
+  %retval2 = phi i64 [ %index, %loop.inc ]
+  ret i64 %retval2
+}
+
+
+; The early exit (i.e. unknown exit-not-taken count) is the latch - we don't
+; support this yet.
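+;
+; For reference, the loop below corresponds roughly to the following C sketch,
+; where the variable names are purely illustrative:
+;
+;   long i = 3;
+;   for (;;) {
+;     long next = i + 1;
+;     if (next == 43) return 64;     // counted exit in the loop header
+;     if (p1[i] == p2[i]) return i;  // early exit taken from the latch
+;     i = next;
+;   }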
+define i64 @early_exit_on_last_block() {
+; CHECK-LABEL: define i64 @early_exit_on_last_block(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[SEARCH:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[CMP1]], label [[SEARCH]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       search:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_END_LOOPEXIT]], label [[LAND_RHS]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %search ], [ 3, %entry ]
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %search, label %loop.end
+
+search:
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.end, label %loop
+
+loop.end:
+  %retval = phi i64 [ 64, %loop ], [ %index, %search ]
+  ret i64 %retval
+}
+
+
+; There are multiple exiting blocks - two of them have exactly-known
+; exit-not-taken counts and the other is unknown, i.e. the "early exit".
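+;
+; Roughly equivalent to the following C sketch (illustrative names only):
+;
+;   long i = 3;
+;   for (;;) {
+;     if (i == 64) return 64;        // known exit-not-taken count
+;     if (p1[i] == p2[i]) return i;  // unknown count, i.e. the early exit
+;     if (++i == 128) return 128;    // known exit-not-taken count
+;   }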
+define i64 @multiple_exits_one_early() {
+; CHECK-LABEL: define i64 @multiple_exits_one_early(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 62, [[TMP7]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 62, [[TMP9]]
+; CHECK-NEXT:    [[TMP43:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP44:%.*]] = select i1 [[TMP43]], i64 [[TMP9]], i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 62, [[TMP44]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <vscale x 16 x i64> [[TMP13]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP30:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP33]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP34]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP35]], align 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP37:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP36]])
+; CHECK-NEXT:    br i1 [[TMP37]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP11]]
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP36]], i1 true)
+; CHECK-NEXT:    [[TMP40:%.*]] = add i64 [[TMP39]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP40]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64
+; CHECK-NEXT:    br i1 [[CMP1]], label [[SEARCH:%.*]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       search:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_END_LOOPEXIT]], label [[FOR_INC]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP33:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %cmp1 = icmp ne i64 %index, 64
+  br i1 %cmp1, label %search, label %loop.end
+
+search:
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.end, label %loop.inc
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 128
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 64, %loop ], [ %index, %search ], [ 128, %loop.inc ]
+  ret i64 %retval
+}
+
+
+; We don't currently support multiple early exits.
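+;
+; Roughly equivalent to the following C sketch (illustrative names only),
+; with two data-dependent early exits ahead of the counted exit:
+;
+;   long i = 3;
+;   for (;;) {
+;     unsigned char a = p1[i], b = p2[i];
+;     if (a == b) return i;      // first early exit
+;     if (a < 34) return 100;    // second early exit
+;     if (++i == 43) return 43;  // counted exit
+;   }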
+define i64 @multiple_early_exits() {
+; CHECK-LABEL: define i64 @multiple_early_exits(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       search1:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC:%.*]]
+; CHECK:       search2:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i8 [[TMP41]], 34
+; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_END_LOOPEXIT]], label [[FOR_INC1]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 100, [[FOR_INC]] ], [ 43, [[FOR_INC1]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %search1
+
+search1:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp1 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp1, label %loop.end, label %search2
+
+search2:
+  %cmp2 = icmp ult i8 %ld1, 34
+  br i1 %cmp2, label %loop.end, label %loop.inc
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %search1, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @early_exit_infinite_loop() {
+; CHECK-LABEL: define i64 @early_exit_infinite_loop(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br label [[LAND_RHS]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br label %loop
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ]
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use_inv_cond(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP5]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i1> poison, i1 [[COND]], i64 0
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <vscale x 16 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 16 x i64> [[TMP9]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = xor <vscale x 16 x i1> [[TMP32]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <vscale x 16 x i1> [[TMP33]], i32 0
+; CHECK-NEXT:    br i1 [[TMP34]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; CHECK:       loop.inc2:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP37:%.*]] = add i64 0, [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
+; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP35:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cond, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use1_with_reduction() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_with_reduction(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    br label [[LAND_RHS:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[RED_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[LD2_ZEXT:%.*]] = zext i8 [[TMP39]] to i64
+; CHECK-NEXT:    [[RED_NEXT]] = add i64 [[RED]], [[LD2_ZEXT]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[FOR_INC]] ], [ [[RED_NEXT]], [[LAND_RHS]] ]
+; CHECK-NEXT:    [[FINAL_IND:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ]
+; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = add i64 [[RED_NEXT_LCSSA]], [[FINAL_IND]]
+; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %ld2.zext = zext i8 %ld2 to i64
+  %red.next = add i64 %red, %ld2.zext
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %final.ind = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  %retval = add i64 %red.next, %final.ind
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas(
+; MAY_FAULT-SAME: ) #[[ATTR0]] {
+; MAY_FAULT-NEXT:  entry:
+; MAY_FAULT-NEXT:    [[P1:%.*]] = alloca [42 x i8], align 4
+; MAY_FAULT-NEXT:    [[P2:%.*]] = alloca [42 x i8], align 4
+; MAY_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; MAY_FAULT:       loop:
+; MAY_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP1]], [[TMP2]]
+; MAY_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; MAY_FAULT:       loop.inc:
+; MAY_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; MAY_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; MAY_FAULT:       loop.end:
+; MAY_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ]
+; MAY_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas(
+; NO_FAULT-SAME: ) #[[ATTR0]] {
+; NO_FAULT-NEXT:  entry:
+; NO_FAULT-NEXT:    [[P1:%.*]] = alloca [42 x i8], align 4
+; NO_FAULT-NEXT:    [[P2:%.*]] = alloca [42 x i8], align 4
+; NO_FAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 16
+; NO_FAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP2]]
+; NO_FAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT:       vector.ph:
+; NO_FAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; NO_FAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP4]]
+; NO_FAULT-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; NO_FAULT-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; NO_FAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; NO_FAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO_FAULT:       vector.body:
+; NO_FAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; NO_FAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; NO_FAULT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; NO_FAULT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; NO_FAULT-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; NO_FAULT-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; NO_FAULT-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; NO_FAULT-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; NO_FAULT-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; NO_FAULT-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; NO_FAULT-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; NO_FAULT-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; NO_FAULT-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; NO_FAULT-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; NO_FAULT-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; NO_FAULT-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; NO_FAULT-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; NO_FAULT-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; NO_FAULT-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; NO_FAULT-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; NO_FAULT-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; NO_FAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; NO_FAULT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; NO_FAULT-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; NO_FAULT-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; NO_FAULT:       loop.inc3:
+; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; NO_FAULT:       vector.early.exit:
+; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; NO_FAULT-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; NO_FAULT-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; NO_FAULT:       middle.block:
+; NO_FAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; NO_FAULT:       scalar.ph:
+; NO_FAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; NO_FAULT:       loop:
+; NO_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; NO_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; NO_FAULT:       loop.inc:
+; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP37:![0-9]+]]
+; NO_FAULT:       loop.end:
+; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  %p1 = alloca [42 x i8]
+  %p2 = alloca [42 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) {
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(
+; MAY_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) #[[ATTR0]] {
+; MAY_FAULT-NEXT:  entry:
+; MAY_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; MAY_FAULT:       loop:
+; MAY_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP1]], [[TMP2]]
+; MAY_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; MAY_FAULT:       loop.inc:
+; MAY_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; MAY_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; MAY_FAULT:       loop.end:
+; MAY_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ]
+; MAY_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(
+; NO_FAULT-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) #[[ATTR0]] {
+; NO_FAULT-NEXT:  entry:
+; NO_FAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 16
+; NO_FAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP2]]
+; NO_FAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT:       vector.ph:
+; NO_FAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; NO_FAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP4]]
+; NO_FAULT-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; NO_FAULT-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; NO_FAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; NO_FAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO_FAULT:       vector.body:
+; NO_FAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; NO_FAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; NO_FAULT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; NO_FAULT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; NO_FAULT-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; NO_FAULT-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; NO_FAULT-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; NO_FAULT-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; NO_FAULT-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; NO_FAULT-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; NO_FAULT-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; NO_FAULT-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; NO_FAULT-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; NO_FAULT-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; NO_FAULT-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; NO_FAULT-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; NO_FAULT-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; NO_FAULT-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; NO_FAULT-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; NO_FAULT-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; NO_FAULT-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; NO_FAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; NO_FAULT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; NO_FAULT-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; NO_FAULT-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; NO_FAULT:       loop.inc3:
+; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; NO_FAULT:       vector.early.exit:
+; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; NO_FAULT-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; NO_FAULT-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; NO_FAULT:       middle.block:
+; NO_FAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; NO_FAULT:       scalar.ph:
+; NO_FAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; NO_FAULT:       loop:
+; NO_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; NO_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; NO_FAULT:       loop.inc:
+; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; NO_FAULT:       loop.end:
+; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
+; MAY_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
+; MAY_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; MAY_FAULT-NEXT:  entry:
+; MAY_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; MAY_FAULT:       loop:
+; MAY_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; MAY_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; MAY_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; MAY_FAULT-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; MAY_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP1]], [[TMP2]]
+; MAY_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]]
+; MAY_FAULT:       loop.inc:
+; MAY_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; MAY_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; MAY_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]]
+; MAY_FAULT:       loop.end:
+; MAY_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ]
+; MAY_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+; NO_FAULT-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
+; NO_FAULT-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; NO_FAULT-NEXT:  entry:
+; NO_FAULT-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 16
+; NO_FAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP2]]
+; NO_FAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO_FAULT:       vector.ph:
+; NO_FAULT-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; NO_FAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP4]]
+; NO_FAULT-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; NO_FAULT-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; NO_FAULT-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; NO_FAULT-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; NO_FAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NO_FAULT:       vector.body:
+; NO_FAULT-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[FOR_INC3:%.*]] ]
+; NO_FAULT-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; NO_FAULT-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; NO_FAULT-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; NO_FAULT-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; NO_FAULT-NEXT:    [[TMP8:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP7]]
+; NO_FAULT-NEXT:    [[TMP9:%.*]] = mul <vscale x 16 x i64> [[TMP8]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP10:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP9]]
+; NO_FAULT-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; NO_FAULT-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 1
+; NO_FAULT-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 2
+; NO_FAULT-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 3
+; NO_FAULT-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 4
+; NO_FAULT-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 5
+; NO_FAULT-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 6
+; NO_FAULT-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 7
+; NO_FAULT-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 8
+; NO_FAULT-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 9
+; NO_FAULT-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 10
+; NO_FAULT-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 11
+; NO_FAULT-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 12
+; NO_FAULT-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 13
+; NO_FAULT-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 14
+; NO_FAULT-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 15
+; NO_FAULT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP27]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; NO_FAULT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP11]]
+; NO_FAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0
+; NO_FAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP30]], align 1
+; NO_FAULT-NEXT:    [[TMP31:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; NO_FAULT-NEXT:    [[TMP32:%.*]] = xor <vscale x 16 x i1> [[TMP31]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; NO_FAULT-NEXT:    [[TMP33:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP32]])
+; NO_FAULT-NEXT:    br i1 [[TMP33]], label [[VECTOR_EARLY_EXIT:%.*]], label [[FOR_INC3]]
+; NO_FAULT:       loop.inc3:
+; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
+; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; NO_FAULT:       vector.early.exit:
+; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
+; NO_FAULT-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP36]]
+; NO_FAULT-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
+; NO_FAULT:       middle.block:
+; NO_FAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; NO_FAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; NO_FAULT:       scalar.ph:
+; NO_FAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; NO_FAULT-NEXT:    br label [[LAND_RHS:%.*]]
+; NO_FAULT:       loop:
+; NO_FAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO_FAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO_FAULT-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; NO_FAULT-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; NO_FAULT-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
+; NO_FAULT-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
+; NO_FAULT:       loop.inc:
+; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP41:![0-9]+]]
+; NO_FAULT:       loop.end:
+; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+;.
+; MAY_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; MAY_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; MAY_FAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; MAY_FAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]}
+;.
+; NO_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO_FAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO_FAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP37]] = distinct !{[[LOOP37]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP38]] = distinct !{[[LOOP38]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP39]] = distinct !{[[LOOP39]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP40]] = distinct !{[[LOOP40]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP41]] = distinct !{[[LOOP41]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/control-flow.ll b/llvm/test/Transforms/LoopVectorize/control-flow.ll
index a27f2f0841bca8..27b70664863182 100644
--- a/llvm/test/Transforms/LoopVectorize/control-flow.ll
+++ b/llvm/test/Transforms/LoopVectorize/control-flow.ll
@@ -10,7 +10,7 @@
 ;   return 0;
 ; }
 
-; CHECK: remark: source.cpp:5:9: loop not vectorized: could not determine number of loop iterations
+; CHECK: remark: source.cpp:8:7: loop not vectorized: cannot vectorize stores in early exit loop
 ; CHECK: remark: source.cpp:5:9: loop not vectorized
 
 ; CHECK: _Z4testPii

>From 813d6934435411785b95a605b41953c27ab9f34a Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 22 Apr 2024 12:56:53 +0000
Subject: [PATCH 2/5] Address review comments

* Renamed CountableEarlyExitBlocks -> CountableEarlyExitingBlocks
* Renamed getExactExitingBlocks -> getCountableExitingBlocks
* Updated comments in code.
* Improved analysis/debug message.
* Simplified code in isAnalyzableEarlyExitLoop and
BackedgeTakenInfo::getSpeculative.
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  4 +--
 llvm/include/llvm/Analysis/ScalarEvolution.h  | 10 ++++----
 llvm/include/llvm/Support/GenericLoopInfo.h   |  4 +--
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 25 ++++++++-----------
 llvm/lib/Analysis/ScalarEvolution.cpp         |  7 ++----
 llvm/lib/Transforms/Vectorize/VPlan.h         |  2 +-
 .../Transforms/LoopVectorize/control-flow.ll  |  2 +-
 7 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index d79c53f490c927..7c305f5cca83d6 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -626,7 +626,7 @@ class LoopAccessInfo {
   /// Returns all blocks with a countable exit, i.e. the exit-not-taken count
   /// is known exactly at compile time.
   const SmallVector<BasicBlock *, 4> &getCountableEarlyExitingBlocks() const {
-    return CountableEarlyExitBlocks;
+    return CountableEarlyExitingBlocks;
   }
 
   /// The diagnostics report generated for the analysis.  E.g. why we
@@ -731,7 +731,7 @@ class LoopAccessInfo {
 
   /// Keeps track of all the early exits with known or countable exit-not-taken
   /// counts.
-  SmallVector<BasicBlock *, 4> CountableEarlyExitBlocks;
+  SmallVector<BasicBlock *, 4> CountableEarlyExitingBlocks;
 
   /// Indicator that there are non vectorizable stores to a uniform address.
   bool HasDependenceInvolvingLoopInvariantAddress = false;
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 562deab8b4159e..6bba10d83ad507 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -917,9 +917,9 @@ class ScalarEvolution {
   }
 
   /// Return all the exiting blocks with exact exit counts.
-  void getExactExitingBlocks(const Loop *L,
-                             SmallVector<BasicBlock *, 4> *Blocks) {
-    getBackedgeTakenInfo(L).getExactExitingBlocks(L, this, Blocks);
+  void getCountableExitingBlocks(const Loop *L,
+                                 SmallVector<BasicBlock *, 4> *Blocks) {
+    getBackedgeTakenInfo(L).getCountableExitingBlocks(L, this, Blocks);
   }
 
   /// Return true if the backedge taken count is either the value returned by
@@ -1562,8 +1562,8 @@ class ScalarEvolution {
                          ScalarEvolution *SE) const;
 
     /// Return all the exiting blocks with exact exit counts.
-    void getExactExitingBlocks(const Loop *L, ScalarEvolution *SE,
-                               SmallVector<BasicBlock *, 4> *Blocks) const;
+    void getCountableExitingBlocks(const Loop *L, ScalarEvolution *SE,
+                                   SmallVector<BasicBlock *, 4> *Blocks) const;
 
     /// Get the constant max backedge taken count for the loop.
     const SCEV *getConstantMax(ScalarEvolution *SE) const;
diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h
index 83cacf864089cc..4d474fd272bb25 100644
--- a/llvm/include/llvm/Support/GenericLoopInfo.h
+++ b/llvm/include/llvm/Support/GenericLoopInfo.h
@@ -294,8 +294,8 @@ template <class BlockT, class LoopT> class LoopBase {
   /// Otherwise return null.
   BlockT *getUniqueExitBlock() const;
 
-  /// Return the exit block for the latch if one exists. This function assumes
-  /// the loop has a latch.
+  /// Return the exit block for the latch. This function assumes the loop has a
+  /// single latch.
   BlockT *getLatchExitBlock() const;
 
   /// Return true if this loop does not have any exit blocks.
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index a49a97402786ae..bf21d494fec7d2 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2419,11 +2419,11 @@ bool LoopAccessInfo::isAnalyzableEarlyExitLoop() {
   if (ExitingBlocks.size() < 2)
     return false;
 
-  SmallVector<BasicBlock *, 4> ExactExitingBlocks;
-  PSE->getSE()->getExactExitingBlocks(TheLoop, &ExactExitingBlocks);
+  SmallVector<BasicBlock *, 4> CountableExitingBBs;
+  PSE->getSE()->getCountableExitingBlocks(TheLoop, &CountableExitingBBs);
 
   // We only support one speculative early exit.
-  if ((ExitingBlocks.size() - ExactExitingBlocks.size()) > 1)
+  if ((ExitingBlocks.size() - CountableExitingBBs.size()) > 1)
     return false;
 
   // There could be multiple exiting blocks with an exact exit-not-taken
@@ -2431,13 +2431,7 @@ bool LoopAccessInfo::isAnalyzableEarlyExitLoop() {
   // unknown count.
   BasicBlock *TmpBB = nullptr;
   for (BasicBlock *BB1 : ExitingBlocks) {
-    bool Found = false;
-    for (BasicBlock *BB2 : ExactExitingBlocks)
-      if (BB1 == BB2) {
-        Found = true;
-        break;
-      }
-    if (!Found) {
+    if (!is_contained(CountableExitingBBs, BB1)) {
       TmpBB = BB1;
       break;
     }
@@ -2468,7 +2462,7 @@ bool LoopAccessInfo::isAnalyzableEarlyExitLoop() {
     }
   assert(SpeculativeEarlyExitBB &&
          "Expected to find speculative early exit block");
-  CountableEarlyExitBlocks = std::move(ExactExitingBlocks);
+  CountableEarlyExitingBlocks = std::move(CountableExitingBBs);
 
   return true;
 }
@@ -2626,9 +2620,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       if (I.mayWriteToMemory()) {
         auto *St = dyn_cast<StoreInst>(&I);
         if (SpeculativeEarlyExitingBB) {
-          recordAnalysis("CantVectorizeInstruction", St)
-              << "cannot vectorize stores in early exit loop";
-          LLVM_DEBUG(dbgs() << "LAA: Found a store in early exit loop.\n");
+          recordAnalysis("CantVectorizeInstruction", &I)
+              << "cannot vectorize instructions that write to memory in early "
+              << "exit loop";
+          LLVM_DEBUG(dbgs() << "LAA: Found an instruction that writes to "
+                            << "memory in early exit loop.\n");
           HasComplexWorkInEarlyExitLoop = true;
           continue;
         }
@@ -2784,7 +2780,6 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
   Accesses.buildDependenceSets();
 
   if (SpeculativeEarlyExitingBB) {
-    assert(!Stores.size() && "Did not expect stores in an early exit loop!");
     LoopMayFault = Accesses.mayFault();
     CanVecMem = true;
     return;
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 9bb4a3f7fca2c0..e11e8d0fb811b3 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -8602,7 +8602,7 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE,
   return SE->getUMinFromMismatchedTypes(Ops, /* Sequential */ true);
 }
 
-void ScalarEvolution::BackedgeTakenInfo::getExactExitingBlocks(
+void ScalarEvolution::BackedgeTakenInfo::getCountableExitingBlocks(
     const Loop *L, ScalarEvolution *SE,
     SmallVector<BasicBlock *, 4> *Blocks) const {
   // All exiting blocks we have collected must dominate the only backedge.
@@ -8625,10 +8625,7 @@ const SCEV *ScalarEvolution::BackedgeTakenInfo::getSpeculative(
     SmallVector<const SCEVPredicate *, 4> *Preds) const {
   // All exiting blocks we have collected must dominate the only backedge.
   const BasicBlock *Latch = L->getLoopLatch();
-  if (!Latch)
-    return SE->getCouldNotCompute();
-
-  if (!hasAnyInfo())
+  if (!Latch || !hasAnyInfo())
     return SE->getCouldNotCompute();
 
   // All exiting blocks we have gathered dominate loop's latch, so speculative
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 586c428cb8a082..5614e5e9c1181d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2962,7 +2962,7 @@ class VPRegionBlock : public VPBlockBase {
   /// VPRegionBlock.
   VPBlockBase *Exiting;
 
-  /// Hold the Early Exiting block of the SEME region, if one exists.
+  /// Hold the Early Exit block of the SEME region, if one exists.
   VPBlockBase *EarlyExit;
 
   /// We need to keep track of the early exit block from the original scalar
diff --git a/llvm/test/Transforms/LoopVectorize/control-flow.ll b/llvm/test/Transforms/LoopVectorize/control-flow.ll
index 27b70664863182..926e6db3d04ab9 100644
--- a/llvm/test/Transforms/LoopVectorize/control-flow.ll
+++ b/llvm/test/Transforms/LoopVectorize/control-flow.ll
@@ -10,7 +10,7 @@
 ;   return 0;
 ; }
 
-; CHECK: remark: source.cpp:8:7: loop not vectorized: cannot vectorize stores in early exit loop
+; CHECK: remark: source.cpp:8:7: loop not vectorized: cannot vectorize instructions that write to memory in early exit loop
 ; CHECK: remark: source.cpp:5:9: loop not vectorized
 
 ; CHECK: _Z4testPii

>From 2425c69261b4ed7dbd1e2cf4d7c7f912b44143c1 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 24 Apr 2024 09:06:14 +0000
Subject: [PATCH 3/5] Remove getSpeculative in favour of getSymbolicMax

* I've rewritten the loop variant of BackedgeTakenInfo::getSymbolicMax
to be more consistent with BackedgeTakenInfo::getExact so that it
now also accepts predicates.
* I've changed getPredicatedBackedgeTakenCount to use getSymbolicMax,
although we still require the latch block to have an exact
exit-not-taken count (a usage sketch follows below).
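
A rough usage sketch (illustrative only; the wrapper function,
ScalarEvolution reference and Loop pointer below are placeholders and
not part of this patch):

  // Hypothetical caller, shown only to illustrate the Speculative flag.
  const SCEV *getSpeculativeBTC(ScalarEvolution &SE, const Loop *L,
                                SmallVector<const SCEVPredicate *, 4> &Preds) {
    // Returns SCEVCouldNotCompute unless the latch has an exact
    // exit-not-taken count; otherwise returns the sequential umin of all
    // known exit counts, appending the required predicates to Preds.
    return SE.getPredicatedBackedgeTakenCount(L, Preds, /*Speculative=*/true);
  }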
---
 llvm/include/llvm/Analysis/ScalarEvolution.h |  34 ++---
 llvm/lib/Analysis/ScalarEvolution.cpp        | 125 ++++++++-----------
 2 files changed, 63 insertions(+), 96 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 6bba10d83ad507..309396207c6274 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -893,9 +893,10 @@ class ScalarEvolution {
   /// SCEV predicates to Predicates that are required to be true in order for
   /// the answer to be correct. Predicates can be checked with run-time
   /// checks and can be used to perform loop versioning. If \p Speculative is
-  /// true, this will attempt to return the speculative backedge count for loops
-  /// with early exits. However, this is only possible if we can formulate an
-  /// exact expression for the backedge count from the latch block.
+  /// true, this will attempt to return the speculative backedge count (a
+  /// restricted variant of the symbolic maximum) for loops with early exits.
+  /// However, this is only possible if we can formulate an exact expression for
+  /// the backedge count from the latch block.
   const SCEV *getPredicatedBackedgeTakenCount(
       const Loop *L, SmallVector<const SCEVPredicate *, 4> &Predicates,
       bool Speculative = false);
@@ -1492,10 +1493,6 @@ class ScalarEvolution {
     /// the loop.
     bool IsComplete = false;
 
-    /// Expression indicating the least maximum backedge-taken count of the loop
-    /// that is known, or a SCEVCouldNotCompute. Lazily computed on first query.
-    const SCEV *SymbolicMax = nullptr;
-
     /// True iff the backedge is taken either exactly Max or zero times.
     bool MaxOrZero = false;
 
@@ -1544,16 +1541,6 @@ class ScalarEvolution {
     const SCEV *getExact(const Loop *L, ScalarEvolution *SE,
                          SmallVector<const SCEVPredicate *, 4> *Predicates = nullptr) const;
 
-    /// Similar to the above, except we permit unknown exit counts from
-    /// non-latch exit blocks. Any such early exit blocks must dominate the
-    /// latch and so the returned expression represents the speculative, or
-    /// maximum possible, *backedge-taken* count of the loop. If there is no
-    /// exact exit count for the latch this function returns
-    /// SCEVCouldNotCompute.
-    const SCEV *getSpeculative(
-        const Loop *L, ScalarEvolution *SE,
-        SmallVector<const SCEVPredicate *, 4> *Predicates = nullptr) const;
-
     /// Return the number of times this loop exit may fall through to the back
     /// edge, or SCEVCouldNotCompute. The loop is guaranteed not to exit via
     /// this block before this number of iterations, but may exit via another
@@ -1573,7 +1560,9 @@ class ScalarEvolution {
                                ScalarEvolution *SE) const;
 
     /// Get the symbolic max backedge taken count for the loop.
-    const SCEV *getSymbolicMax(const Loop *L, ScalarEvolution *SE);
+    const SCEV *getSymbolicMax(
+        const Loop *L, ScalarEvolution *SE,
+        SmallVector<const SCEVPredicate *, 4> *Predicates = nullptr) const;
 
     /// Get the symbolic max backedge taken count for the particular loop exit.
     const SCEV *getSymbolicMax(const BasicBlock *ExitingBlock,
@@ -1582,6 +1571,10 @@ class ScalarEvolution {
     /// Return true if the number of times this backedge is taken is either the
     /// value returned by getConstantMax or zero.
     bool isConstantMaxOrZero(ScalarEvolution *SE) const;
+
+    /// Return true if we have an exact exit-not-taken count for the exiting
+    /// block.
+    bool hasExact(const BasicBlock *ExitingBlock, ScalarEvolution *SE) const;
   };
 
   /// Cache the backedge-taken count of the loops for this function as they
@@ -1785,11 +1778,6 @@ class ScalarEvolution {
   ExitLimit computeExitLimit(const Loop *L, BasicBlock *ExitingBlock,
                              bool AllowPredicates = false);
 
-  /// Return a symbolic upper bound for the backedge taken count of the loop.
-  /// This is more general than getConstantMaxBackedgeTakenCount as it returns
-  /// an arbitrary expression as opposed to only constants.
-  const SCEV *computeSymbolicMaxBackedgeTakenCount(const Loop *L);
-
   // Helper functions for computeExitLimitFromCond to avoid exponential time
   // complexity.
 
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index e11e8d0fb811b3..ef032ad78238ed 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -8279,10 +8279,14 @@ const SCEV *ScalarEvolution::getExitCount(const Loop *L,
 const SCEV *ScalarEvolution::getPredicatedBackedgeTakenCount(
     const Loop *L, SmallVector<const SCEVPredicate *, 4> &Preds,
     bool Speculative) {
-  if (Speculative)
-    return getPredicatedBackedgeTakenInfo(L).getSpeculative(L, this, &Preds);
-  else
-    return getPredicatedBackedgeTakenInfo(L).getExact(L, this, &Preds);
+  const BackedgeTakenInfo &BTI = getPredicatedBackedgeTakenInfo(L);
+  if (Speculative) {
+    const BasicBlock *Latch = L->getLoopLatch();
+    if (!Latch || !BTI.hasExact(Latch, this))
+      return getCouldNotCompute();
+    return BTI.getSymbolicMax(L, this, &Preds);
+  } else
+    return BTI.getExact(L, this, &Preds);
 }
 
 const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L,
@@ -8620,45 +8624,6 @@ void ScalarEvolution::BackedgeTakenInfo::getCountableExitingBlocks(
   return;
 }
 
-const SCEV *ScalarEvolution::BackedgeTakenInfo::getSpeculative(
-    const Loop *L, ScalarEvolution *SE,
-    SmallVector<const SCEVPredicate *, 4> *Preds) const {
-  // All exiting blocks we have collected must dominate the only backedge.
-  const BasicBlock *Latch = L->getLoopLatch();
-  if (!Latch || !hasAnyInfo())
-    return SE->getCouldNotCompute();
-
-  // All exiting blocks we have gathered dominate loop's latch, so speculative
-  // trip count is simply a minimum out of all these calculated exit counts.
-  SmallVector<const SCEV *, 2> Ops;
-  bool FoundLatch = false;
-  for (const auto &ENT : ExitNotTaken) {
-    const SCEV *BECount = ENT.ExactNotTaken;
-    if (BECount == SE->getCouldNotCompute())
-      continue;
-
-    assert(SE->DT.dominates(ENT.ExitingBlock, Latch) &&
-           "We should only have known counts for exiting blocks that dominate "
-           "latch!");
-    Ops.push_back(BECount);
-    if (Preds)
-      for (const auto *P : ENT.Predicates)
-        Preds->push_back(P);
-    assert((Preds || ENT.hasAlwaysTruePredicate()) &&
-           "Predicate should be always true!");
-    if (ENT.ExitingBlock == Latch)
-      FoundLatch = true;
-  }
-
-  if (!FoundLatch)
-    return SE->getCouldNotCompute();
-
-  // If an earlier exit exits on the first iteration (exit count zero), then
-  // a later poison exit count should not propagate into the result. This are
-  // exactly the semantics provided by umin_seq.
-  return SE->getUMinFromMismatchedTypes(Ops, /* Sequential */ true);
-}
-
 /// Get the exact not taken count for this loop exit.
 const SCEV *
 ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock,
@@ -8670,6 +8635,15 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock,
   return SE->getCouldNotCompute();
 }
 
+bool ScalarEvolution::BackedgeTakenInfo::hasExact(
+    const BasicBlock *ExitingBlock, ScalarEvolution *SE) const {
+  for (const auto &ENT : ExitNotTaken)
+    if (ENT.ExitingBlock == ExitingBlock)
+      return ENT.ExactNotTaken != SE->getCouldNotCompute();
+
+  return false;
+}
+
 const SCEV *ScalarEvolution::BackedgeTakenInfo::getConstantMax(
     const BasicBlock *ExitingBlock, ScalarEvolution *SE) const {
   for (const auto &ENT : ExitNotTaken)
@@ -8704,12 +8678,41 @@ ScalarEvolution::BackedgeTakenInfo::getConstantMax(ScalarEvolution *SE) const {
   return getConstantMax();
 }
 
-const SCEV *
-ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(const Loop *L,
-                                                   ScalarEvolution *SE) {
-  if (!SymbolicMax)
-    SymbolicMax = SE->computeSymbolicMaxBackedgeTakenCount(L);
-  return SymbolicMax;
+const SCEV *ScalarEvolution::BackedgeTakenInfo::getSymbolicMax(
+    const Loop *L, ScalarEvolution *SE,
+    SmallVector<const SCEVPredicate *, 4> *Preds) const {
+  // If no exit counts were computable, neither is the symbolic maximum.
+  if (ExitNotTaken.empty())
+    return SE->getCouldNotCompute();
+
+  const BasicBlock *Latch = L->getLoopLatch();
+  // All exiting blocks we have collected must dominate the only backedge.
+  if (!Latch)
+    return SE->getCouldNotCompute();
+
+  // Form an expression for the maximum exit count possible for this loop. We
+  // merge the max and exact information to approximate a version of
+  // getConstantMaxBackedgeTakenCount which isn't restricted to just constants.
+  SmallVector<const SCEV *, 4> ExitCounts;
+  for (const auto &ENT : ExitNotTaken) {
+    const SCEV *ExitCount = ENT.SymbolicMaxNotTaken;
+    if (ExitCount == SE->getCouldNotCompute())
+      continue;
+
+    assert(SE->DT.dominates(ENT.ExitingBlock, Latch) &&
+           "We should only have known counts for exiting blocks that dominate "
+           "the latch!");
+    ExitCounts.push_back(ExitCount);
+    if (Preds)
+      for (const auto *P : ENT.Predicates)
+        Preds->push_back(P);
+    assert((Preds || ENT.hasAlwaysTruePredicate()) &&
+           "Predicate should be always true!");
+  }
+
+  if (ExitCounts.empty())
+    return SE->getCouldNotCompute();
+  return SE->getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true);
 }
 
 bool ScalarEvolution::BackedgeTakenInfo::isConstantMaxOrZero(
@@ -15024,30 +15027,6 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
   return false;
 }
 
-const SCEV *
-ScalarEvolution::computeSymbolicMaxBackedgeTakenCount(const Loop *L) {
-  SmallVector<BasicBlock*, 16> ExitingBlocks;
-  L->getExitingBlocks(ExitingBlocks);
-
-  // Form an expression for the maximum exit count possible for this loop. We
-  // merge the max and exact information to approximate a version of
-  // getConstantMaxBackedgeTakenCount which isn't restricted to just constants.
-  SmallVector<const SCEV*, 4> ExitCounts;
-  for (BasicBlock *ExitingBB : ExitingBlocks) {
-    const SCEV *ExitCount =
-        getExitCount(L, ExitingBB, ScalarEvolution::SymbolicMaximum);
-    if (!isa<SCEVCouldNotCompute>(ExitCount)) {
-      assert(DT.dominates(ExitingBB, L->getLoopLatch()) &&
-             "We should only have known counts for exiting blocks that "
-             "dominate latch!");
-      ExitCounts.push_back(ExitCount);
-    }
-  }
-  if (ExitCounts.empty())
-    return getCouldNotCompute();
-  return getUMinFromMismatchedTypes(ExitCounts, /*Sequential*/ true);
-}
-
 /// A rewriter to replace SCEV expressions in Map with the corresponding entry
 /// in the map. It skips AddRecExpr because we cannot guarantee that the
 /// replacement is loop invariant in the loop of the AddRec.

>From bc36e825a90fae83d86d42665f0baaf5e721f292 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 24 Apr 2024 11:46:47 +0000
Subject: [PATCH 4/5] Make sure to call fixPhi on live-outs

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4cb5926503581b..a33fb94ae7d75a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3616,15 +3616,12 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
   State.Builder.SetInsertPoint(State.CFG.ExitBB,
                                State.CFG.ExitBB->getFirstNonPHIIt());
   for (const auto &KV : Plan.getLiveOuts())
-    if (OrigLoop->getUniqueExitBlock() ||
-        KV.second->getPhi()->getParent() != OrigEarlyExitBB)
-      KV.second->fixPhi(Plan, State);
+    KV.second->fixPhi(Plan, State);
 
   if (VectorEarlyExitBB) {
     State.Builder.SetInsertPoint(VectorEarlyExitBB->getTerminator());
     for (const auto &KV : Plan.getEarlyExitLiveOuts())
-      if (KV.second->getPhi()->getParent() == OrigEarlyExitBB)
-        KV.second->fixPhi(Plan, State);
+      KV.second->fixPhi(Plan, State);
   }
 
   for (Instruction *PI : PredicatedInstructions)

>From 24d538e3cc02ed38f4aa5d748a602156220825e2 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 1 May 2024 08:32:26 +0000
Subject: [PATCH 5/5] Address review comments

* Renamed CountableEarlyExitingBlocks -> CountableExitingBlocks.
* Added another simple case to mayFault along with a supporting
test (see the illustrative IR snippet below).
* Renamed areRuntimeChecksProfitable -> isOutsideLoopWorkProfitable
and added the cost of work in the early exit block.
* Added new flag to control early-exit vectorisation
(EnableEarlyExitVectorization), which is off by default until
we have a more accurate cost model for the cttz.elts intrinsic.
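
For illustration, the new mayFault case permits a two-index GEP into an
array object where the first index is a constant zero, e.g. (shape taken
from the supporting test; names are illustrative):

  %p1 = alloca [1024 x i8]
  ...
  %arrayidx = getelementptr inbounds [1024 x i8], ptr %p1, i64 0, i64 %index
  %ld1 = load i8, ptr %arrayidx, align 1

Provided the index range fits within the size of the underlying object,
such accesses are no longer conservatively treated as potentially
faulting.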
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |   8 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  30 ++-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 106 ++++++++--
 .../AArch64/simple_early_exit.ll              | 196 ++++++++++++++----
 4 files changed, 273 insertions(+), 67 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 7c305f5cca83d6..2ca045d14a6ada 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -625,8 +625,8 @@ class LoopAccessInfo {
 
   /// Returns all blocks with a countable exit, i.e. the exit-not-taken count
   /// is known exactly at compile time.
-  const SmallVector<BasicBlock *, 4> &getCountableEarlyExitingBlocks() const {
-    return CountableEarlyExitingBlocks;
+  const SmallVector<BasicBlock *, 4> &getCountableExitingBlocks() const {
+    return CountableExitingBlocks;
   }
 
   /// The diagnostics report generated for the analysis.  E.g. why we
@@ -729,9 +729,9 @@ class LoopAccessInfo {
   /// Keeps track of the successor of the early-exiting block, if present.
   BasicBlock *SpeculativeEarlyExitBB = nullptr;
 
-  /// Keeps track of all the early exits with known or countable exit-not-taken
+  /// Keeps track of all the exits with known or countable exit-not-taken
   /// counts.
-  SmallVector<BasicBlock *, 4> CountableEarlyExitingBlocks;
+  SmallVector<BasicBlock *, 4> CountableExitingBlocks;
 
   /// Indicator that there are non vectorizable stores to a uniform address.
   bool HasDependenceInvolvingLoopInvariantAddress = false;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index bf21d494fec7d2..225019eed3a7d4 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1296,8 +1296,29 @@ bool AccessAnalysis::mayFault() {
     // For now only the simplest cases are permitted, but this could be
     // extended further.
     auto *GEP = dyn_cast<GetElementPtrInst>(UO.first);
-    if (!GEP || GEP->getPointerOperand() != UO.second[0] ||
-        GEP->getNumIndices() != 1)
+    if (!GEP || GEP->getPointerOperand() != UO.second[0])
+      return true;
+
+    // The only currently supported case for 2 GEP indices is when accessing
+    // an array, i.e.
+    //   getelementptr [32 x i32], ptr %arr, i64 0, i64 %ind
+    Value *GEPInd;
+    Type *GEPElemType;
+    if (GEP->getNumIndices() == 2) {
+      auto *ArrayTy = dyn_cast<ArrayType>(GEP->getSourceElementType());
+      if (!ArrayTy || !isa<ConstantInt>(GEP->getOperand(1)) ||
+          !cast<ConstantInt>(GEP->getOperand(1))->isZero())
+        return true;
+      GEPInd = GEP->getOperand(2);
+      GEPElemType = ArrayTy->getElementType();
+    } else if (GEP->getNumIndices() == 1) {
+      GEPInd = GEP->getOperand(1);
+      GEPElemType = GEP->getSourceElementType();
+    } else
+      return true;
+
+    // We don't handle scalable GEP element types.
+    if (GEPElemType->getPrimitiveSizeInBits().isScalable())
       return true;
 
     // Verify pointer accessed within the loop always falls within the bounds
@@ -1320,13 +1341,12 @@ bool AccessAnalysis::mayFault() {
     if (!ObjSize)
       return true;
 
-    Value *GEPInd = GEP->getOperand(1);
     const SCEV *IndScev = PSE.getSCEV(GEPInd);
     if (!isa<SCEVAddRecExpr>(IndScev))
       return true;
 
     // Calculate the maximum number of addressable elements in the object.
-    uint64_t ElemSize = GEP->getSourceElementType()->getScalarSizeInBits() / 8;
+    uint64_t ElemSize = GEPElemType->getScalarSizeInBits() / 8;
     uint64_t MaxNumElems = ObjSize / ElemSize;
 
     const SCEV *MinScev = PSE.getSE()->getConstant(GEPInd->getType(), 0);
@@ -2462,7 +2482,7 @@ bool LoopAccessInfo::isAnalyzableEarlyExitLoop() {
     }
   assert(SpeculativeEarlyExitBB &&
          "Expected to find speculative early exit block");
-  CountableEarlyExitingBlocks = std::move(CountableExitingBBs);
+  CountableExitingBlocks = std::move(CountableExitingBBs);
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a33fb94ae7d75a..b8fb84112af5b0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -181,6 +181,10 @@ static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
     cl::desc("Enable vectorization of epilogue loops."));
 
+static cl::opt<bool> EnableEarlyExitVectorization(
+    "enable-early-exit-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of early exit loops."));
+
 static cl::opt<bool> AssumeNoMemFault(
     "vectorizer-no-mem-fault", cl::init(false), cl::Hidden,
     cl::desc("Assume vectorized loops will not have memory faults, which is "
@@ -1490,7 +1494,7 @@ class LoopVectorizationCostModel {
     // the exits.
     if (Legal->hasSpeculativeEarlyExit()) {
       const SmallVector<BasicBlock *, 4> &CountableExitingBlocks =
-          Legal->getLAI()->getCountableEarlyExitingBlocks();
+          Legal->getLAI()->getCountableExitingBlocks();
       if (CountableExitingBlocks.size() > 1 ||
           (CountableExitingBlocks.size() == 1 &&
            CountableExitingBlocks[0] != TheLoop->getLoopLatch()))
@@ -3609,10 +3613,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
 
   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
   // in the exit block, so update the builder.
-  BasicBlock *OrigEarlyExitBB = Legal->hasSpeculativeEarlyExit()
-                                    ? Legal->getSpeculativeEarlyExitBlock()
-                                    : nullptr;
-
   State.Builder.SetInsertPoint(State.CFG.ExitBB,
                                State.CFG.ExitBB->getFirstNonPHIIt());
   for (const auto &KV : Plan.getLiveOuts())
@@ -9928,15 +9928,74 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
   }
 }
 
-static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
-                                       VectorizationFactor &VF,
-                                       std::optional<unsigned> VScale, Loop *L,
-                                       ScalarEvolution &SE,
-                                       ScalarEpilogueLowering SEL) {
+static InstructionCost calculateEarlyExitCost(const TargetTransformInfo *TTI,
+                                              LoopVectorizationLegality *Legal,
+                                              Loop *L, ElementCount VF) {
+  unsigned NumCttzElemCalls = 0;
+  BasicBlock *OrigEarlyExitingBlock = Legal->getSpeculativeEarlyExitingBlock();
+  BasicBlock *OrigLoopLatch = L->getLoopLatch();
+  for (const auto &Entry : Legal->getInductionVars()) {
+    PHINode *OrigPhi = Entry.first;
+    Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
+
+    for (User *U : PostInc->users()) {
+      // This assumes if it's not in the loop then it must be the normal
+      // exit block. However, it could be a user in an early exit block
+      // different to the latch's exit block.
+      auto *UI = cast<Instruction>(U);
+      if (!L->contains(UI)) {
+        PHINode *PHI = dyn_cast<PHINode>(UI);
+        assert(PHI && "Expected LCSSA form");
+        int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock);
+        if (Index != -1 && PHI->getIncomingValue(Index) == PostInc)
+          NumCttzElemCalls++;
+      }
+    }
+
+    for (User *U : OrigPhi->users()) {
+      auto *UI = cast<Instruction>(U);
+      if (!L->contains(UI)) {
+        PHINode *PHI = dyn_cast<PHINode>(UI);
+        assert(PHI && "Expected LCSSA form");
+        int Index = PHI->getBasicBlockIndex(OrigEarlyExitingBlock);
+        if (Index != -1 && PHI->getIncomingValue(Index) == OrigPhi)
+          NumCttzElemCalls++;
+      }
+    }
+  }
+
+  InstructionCost Cost = 0;
+  if (NumCttzElemCalls) {
+    LLVMContext &Context = L->getHeader()->getContext();
+    // Ideally we'd query the vplan for the canonical IV type, but we don't
+    // have a vplan yet so let's assume it's 64-bit.
+    auto CtzType = IntegerType::getIntNTy(Context, 64);
+    auto VecI1Type = VectorType::get(IntegerType::getInt1Ty(Context), VF);
+
+    IntrinsicCostAttributes Attrs(
+        Intrinsic::experimental_cttz_elts, CtzType,
+        {PoisonValue::get(VecI1Type), ConstantInt::getTrue(Context)});
+    Cost = TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_RecipThroughput);
+    Cost *= NumCttzElemCalls;
+  }
+  return Cost;
+}
+
+static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
+                                        VectorizationFactor &VF,
+                                        std::optional<unsigned> VScale, Loop *L,
+                                        ScalarEvolution &SE,
+                                        ScalarEpilogueLowering SEL,
+                                        InstructionCost EarlyExitCost) {
   InstructionCost CheckCost = Checks.getCost();
   if (!CheckCost.isValid())
     return false;
 
+  // Add on the cost of work required in the vector early exit block, if one
+  // exists.
+  if (EarlyExitCost.isValid())
+    CheckCost += EarlyExitCost;
+
   // When interleaving only scalar and vector cost will be equal, which in turn
   // would lead to a divide by 0. Fall back to hard threshold.
   if (VF.Width.isScalar()) {
@@ -10090,14 +10149,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   }
 
   // We may not have LAI at this point.
-  if (LVL.hasSpeculativeEarlyExit() && LVL.getLAI()->mayFault()) {
-    if (!AssumeNoMemFault) {
-      LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot vectorize faulting "
-                        << "loop with early exit.\n");
+  if (LVL.hasSpeculativeEarlyExit()) {
+    if (!EnableEarlyExitVectorization) {
+      LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Auto-vectorization of early "
+                        << "exit loops is disabled.\n");
       return false;
     }
-    LLVM_DEBUG(dbgs() << "LV: Assuming early exit vector loop will not "
-                      << "fault\n");
+    if (LVL.getLAI()->mayFault()) {
+      if (!AssumeNoMemFault) {
+        LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot vectorize faulting "
+                          << "loop with early exit.\n");
+        return false;
+      }
+      LLVM_DEBUG(dbgs() << "LV: Assuming early exit vector loop will not "
+                        << "fault\n");
+    }
   }
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
@@ -10239,12 +10305,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (VF.Width.isVector() || SelectedIC > 1)
       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
 
+    InstructionCost EarlyExitCost = InstructionCost::getInvalid();
+    if (VF.Width.isVector() && LVL.hasSpeculativeEarlyExit())
+      EarlyExitCost = calculateEarlyExitCost(TTI, &LVL, L, VF.Width);
+
     // Check if it is profitable to vectorize with runtime checks.
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
-                                    *PSE.getSE(), SEL)) {
+        !isOutsideLoopWorkProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
+                                     *PSE.getSE(), SEL, EarlyExitCost)) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index ded9ec6153b82d..8d1d999140f6da 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S < %s -p loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,MAY_FAULT
-; RUN: opt -S < %s -p loop-vectorize -vectorizer-no-mem-fault -mattr=+sve -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,NO_FAULT
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -mattr=+sve \
+; RUN:    -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,MAY_FAULT
+; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -vectorizer-no-mem-fault \
+; RUN:    -mattr=+sve -mtriple aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,NO_FAULT
 
 
 define i64 @same_exit_block_pre_inc_use1() {
@@ -113,6 +115,116 @@ loop.end:
 }
 
 
+define i64 @same_exit_block_pre_inc_use1_gep_two_indices() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_gep_two_indices(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[P2:%.*]] = alloca [1024 x i8], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 40, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 40, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 40, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 3, [[N_VEC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[LOOP_INC3:%.*]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[OFFSET_IDX]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i64> [[DOTSPLATINSERT]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 16 x i64> zeroinitializer, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 16 x i64> [[TMP7]], shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 16 x i64> [[DOTSPLAT]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 7
+; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 9
+; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 11
+; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 13
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 15
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP31:%.*]] = xor <vscale x 16 x i1> [[TMP30]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP32:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP31]])
+; CHECK-NEXT:    br i1 [[TMP32]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC3]]
+; CHECK:       loop.inc3:
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
+; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[TMP34]], [[INDEX1]]
+; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP35]]
+; CHECK-NEXT:    br label [[LOOP_END:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 40, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK:       loop.inc:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       loop.end:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 43, [[LOOP_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %arrayidx = getelementptr inbounds [1024 x i8], ptr %p1, i64 0, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds [1024 x i8], ptr %p2, i64 0, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 43
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ %index, %loop ], [ 43, %loop.inc ]
+  ret i64 %retval
+}
+
+
 define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
 ; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_alloca_diff_type(
 ; CHECK-SAME: ) #[[ATTR0]] {
@@ -170,7 +282,7 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
 ; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
@@ -193,7 +305,7 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -278,7 +390,7 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
 ; CHECK-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
@@ -301,7 +413,7 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -386,7 +498,7 @@ define i64 @same_exit_block_pre_inc_use2() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK:       middle.block:
@@ -408,7 +520,7 @@ define i64 @same_exit_block_pre_inc_use2() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 43, [[LAND_RHS]] ], [ [[INDEX]], [[FOR_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ], [ 43, [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -494,7 +606,7 @@ define i64 @same_exit_block_pre_inc_use3() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
 ; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
@@ -524,7 +636,7 @@ define i64 @same_exit_block_pre_inc_use3() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[FOR_INC]] ], [ [[INDEX]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE5]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_ESCAPE7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[INDEX_LCSSA]]
@@ -606,7 +718,7 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT3]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP23]], i1 true)
 ; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[TMP36]], [[INDEX1]]
@@ -627,7 +739,7 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -714,7 +826,7 @@ define i64 @same_exit_block_post_inc_use() {
 ; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[TMP12]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
 ; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
@@ -737,7 +849,7 @@ define i64 @same_exit_block_post_inc_use() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ [[INDEX_NEXT]], [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -824,7 +936,7 @@ define i64 @same_exit_block_post_inc_use2() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP34]], i1 true)
 ; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
@@ -850,7 +962,7 @@ define i64 @same_exit_block_post_inc_use2() {
 ; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT]]
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LAND_RHS]] ], [ [[INDEX]], [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -915,7 +1027,7 @@ define i64 @same_exit_block_phi_of_consts() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK:       middle.block:
@@ -935,7 +1047,7 @@ define i64 @same_exit_block_phi_of_consts() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 0, [[LAND_RHS]] ], [ 1, [[FOR_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -1022,7 +1134,7 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
 ; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX1]]
@@ -1045,7 +1157,7 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       loop.early.exit:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -1138,7 +1250,7 @@ define i64 @diff_exit_block_pre_inc_use2() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    br label [[FOR_EARLY_LOOPEXIT:%.*]]
 ; CHECK:       middle.block:
@@ -1160,7 +1272,7 @@ define i64 @diff_exit_block_pre_inc_use2() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       loop.early.exit:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 43, [[LAND_RHS]] ], [ 43, [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -1253,7 +1365,7 @@ define i64 @diff_exit_block_pre_inc_use3() {
 ; CHECK:       loop.inc4:
 ; CHECK-NEXT:    [[INDEX_NEXT5]] = add nuw i64 [[INDEX2]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP36:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
 ; CHECK-NEXT:    [[TMP37:%.*]] = add i64 [[TMP36]], [[INDEX2]]
@@ -1278,7 +1390,7 @@ define i64 @diff_exit_block_pre_inc_use3() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       loop.early.exit:
 ; CHECK-NEXT:    [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[INDEX_LCSSA]]
@@ -1349,7 +1461,7 @@ define i64 @diff_exit_block_phi_of_consts() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    br label [[FOR_EARLY_LOOPEXIT:%.*]]
 ; CHECK:       middle.block:
@@ -1369,7 +1481,7 @@ define i64 @diff_exit_block_phi_of_consts() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       loop.early.exit:
 ; CHECK-NEXT:    ret i64 0
 ; CHECK:       loop.end:
@@ -1460,7 +1572,7 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; CHECK-NEXT:    [[TMP35:%.*]] = add i64 [[TMP12]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP33]], i1 true)
 ; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
@@ -1483,7 +1595,7 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP31:![0-9]+]]
 ; CHECK:       loop.early.exit:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -1578,7 +1690,7 @@ define i64 @diff_exit_block_post_inc_use2() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP34]], i1 true)
 ; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP37]], [[INDEX1]]
@@ -1604,7 +1716,7 @@ define i64 @diff_exit_block_post_inc_use2() {
 ; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_EARLY_LOOPEXIT]]
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP33:![0-9]+]]
 ; CHECK:       loop.early.exit:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LAND_RHS]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -1751,7 +1863,7 @@ define i64 @multiple_exits_one_early() {
 ; CHECK:       loop.inc3:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP36]], i1 true)
 ; CHECK-NEXT:    [[TMP40:%.*]] = add i64 [[TMP39]], [[INDEX1]]
@@ -1776,7 +1888,7 @@ define i64 @multiple_exits_one_early() {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP33:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP35:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -1970,7 +2082,7 @@ define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) {
 ; CHECK:       loop.inc2:
 ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; CHECK:       vector.early.exit:
 ; CHECK-NEXT:    [[TMP37:%.*]] = add i64 0, [[INDEX1]]
 ; CHECK-NEXT:    [[IND_EARLY_ESCAPE:%.*]] = add i64 3, [[TMP37]]
@@ -1992,7 +2104,7 @@ define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) {
 ; CHECK:       loop.inc:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP35:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP37:![0-9]+]]
 ; CHECK:       loop.end:
 ; CHECK-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -2158,7 +2270,7 @@ define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
 ; NO_FAULT:       loop.inc3:
 ; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
 ; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; NO_FAULT:       vector.early.exit:
 ; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
 ; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
@@ -2181,7 +2293,7 @@ define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
 ; NO_FAULT:       loop.inc:
 ; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP37:![0-9]+]]
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP39:![0-9]+]]
 ; NO_FAULT:       loop.end:
 ; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
 ; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -2286,7 +2398,7 @@ define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceabl
 ; NO_FAULT:       loop.inc3:
 ; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
 ; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; NO_FAULT:       vector.early.exit:
 ; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
 ; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
@@ -2309,7 +2421,7 @@ define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceabl
 ; NO_FAULT:       loop.inc:
 ; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP39:![0-9]+]]
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP41:![0-9]+]]
 ; NO_FAULT:       loop.end:
 ; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
 ; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -2412,7 +2524,7 @@ define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
 ; NO_FAULT:       loop.inc3:
 ; NO_FAULT-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP6]]
 ; NO_FAULT-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]]
-; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; NO_FAULT-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; NO_FAULT:       vector.early.exit:
 ; NO_FAULT-NEXT:    [[TMP35:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
 ; NO_FAULT-NEXT:    [[TMP36:%.*]] = add i64 [[TMP35]], [[INDEX1]]
@@ -2435,7 +2547,7 @@ define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
 ; NO_FAULT:       loop.inc:
 ; NO_FAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
 ; NO_FAULT-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 43
-; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP41:![0-9]+]]
+; NO_FAULT-NEXT:    br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP43:![0-9]+]]
 ; NO_FAULT:       loop.end:
 ; NO_FAULT-NEXT:    [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 43, [[FOR_INC]] ], [ [[IND_EARLY_ESCAPE]], [[VECTOR_EARLY_EXIT]] ], [ 43, [[MIDDLE_BLOCK]] ]
 ; NO_FAULT-NEXT:    ret i64 [[START_0_LCSSA]]
@@ -2498,6 +2610,8 @@ loop.end:
 ; MAY_FAULT: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]}
 ; MAY_FAULT: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]}
 ; MAY_FAULT: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]}
+; MAY_FAULT: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]}
+; MAY_FAULT: [[LOOP37]] = distinct !{[[LOOP37]], [[META2]], [[META1]]}
 ;.
 ; NO_FAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; NO_FAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -2541,4 +2655,6 @@ loop.end:
 ; NO_FAULT: [[LOOP39]] = distinct !{[[LOOP39]], [[META2]], [[META1]]}
 ; NO_FAULT: [[LOOP40]] = distinct !{[[LOOP40]], [[META1]], [[META2]]}
 ; NO_FAULT: [[LOOP41]] = distinct !{[[LOOP41]], [[META2]], [[META1]]}
+; NO_FAULT: [[LOOP42]] = distinct !{[[LOOP42]], [[META1]], [[META2]]}
+; NO_FAULT: [[LOOP43]] = distinct !{[[LOOP43]], [[META2]], [[META1]]}
 ;.


