[llvm] [LV] Initial support for stores in early exit loops (PR #137774)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 29 02:16:47 PDT 2025


https://github.com/huntergr-arm created https://github.com/llvm/llvm-project/pull/137774

Adds some basic support for a simple early exit loop with a store.

This is vectorized such that when the next vector iteration would exit, we bail out to the scalar loop to handle the exit.

The main complication faced is transforming the vplan; it feels quite fragile at the moment, and might be better done up front at initial recipe creation time. Thoughts?

An alternative would be to use tail-folding, but that requires doing brka/brkb inside the loop, which could be costly. And would still likely require moving some IR, just not copying to the preheader.

(One advantage to this approach is that architectures without predication could potentially take advantage of early-exit auto-vectorization, though I suspect we'd want this driven by pragmas instead of just guessing about a possible trip count)

>From a7599dcde512288b8ffd69f8dc2328fa6a505c92 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 4 Apr 2025 15:44:52 +0000
Subject: [PATCH] [LV] Initial support for stores in early exit loops

Adds some basic support for a simple early exit loop with a store.

This is vectorized such that when the next vector iteration would
exit, we bail out to the scalar loop to handle the exit.
---
 .../Vectorize/LoopVectorizationLegality.h     |  17 ++
 .../Vectorize/LoopVectorizationLegality.cpp   | 125 ++++++++++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  21 ++
 llvm/lib/Transforms/Vectorize/VPlan.h         |  17 ++
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 205 ++++++++++++++----
 .../Transforms/Vectorize/VPlanTransforms.h    |   2 +
 .../Transforms/Vectorize/VPlanVerifier.cpp    |   3 +-
 .../AArch64/simple_early_exit.ll              |  77 +++++++
 .../Transforms/LoopVectorize/control-flow.ll  |   2 +-
 .../LoopVectorize/early_exit_legality.ll      | 188 +++++++++++++++-
 10 files changed, 599 insertions(+), 58 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index d654ac3ec9273..64efd3ca32200 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -407,6 +407,13 @@ class LoopVectorizationLegality {
     return hasUncountableEarlyExit() ? getUncountableEdge()->second : nullptr;
   }
 
+  /// Returns true if this is an early exit loop containing a store.
+  bool isConditionCopyRequired() const { return RequiresEarlyExitConditionCopy; }
+
+  /// Returns the load instruction, if any, nearest to an uncountable early
+  /// exit.
+  std::optional<LoadInst *> getEarlyExitLoad() const { return EarlyExitLoad; }
+
   /// Return true if there is store-load forwarding dependencies.
   bool isSafeForAnyStoreLoadForwardDistances() const {
     return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
@@ -654,6 +661,16 @@ class LoopVectorizationLegality {
   /// Keep track of the loop edge to an uncountable exit, comprising a pair
   /// of (Exiting, Exit) blocks, if there is exactly one early exit.
   std::optional<std::pair<BasicBlock *, BasicBlock *>> UncountableEdge;
+
+  /// Indicates that we will need to copy the early exit condition into
+  /// the vector preheader, as we will need to mask some operations in
+  /// the loop (e.g. stores).
+  bool RequiresEarlyExitConditionCopy = false;
+
+  /// The load used to determine an uncountable early-exit condition. This is
+  /// only used to allow further analysis in canVectorizeMemory if we found
+  /// what looks like a valid early exit loop with store beforehand.
+  std::optional<LoadInst *> EarlyExitLoad;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 3ec6850d6f685..9b4e1751ab11d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1209,6 +1210,36 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     });
   }
 
+  // FIXME: Remove or reduce this restriction. We're in a bit of an odd spot
+  //        since we're (potentially) doing the load out of its normal order
+  //        in the loop and that may throw off dependency checking.
+  //        A forward dependency should be fine, but a backwards dep may not
+  //        be even if LAA thinks it is due to performing the load for the
+  //        vector iteration i+1 in vector iteration i.
+  if (isConditionCopyRequired()) {
+    assert(EarlyExitLoad.has_value() && "EE Store without condition load.");
+
+    if (LAI->canVectorizeMemory()) {
+      const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+      const auto *Deps = DepChecker.getDependences();
+
+      for (const MemoryDepChecker::Dependence &Dep : *Deps) {
+        // The condition load is speculated ahead of its position in the
+        // scalar loop, so reject any dependence involving it.
+        if (Dep.getDestination(DepChecker) == *EarlyExitLoad ||
+            Dep.getSource(DepChecker) == *EarlyExitLoad) {
+          reportVectorizationFailure(
+              "No dependencies allowed for early exit condition load",
+              "Early exit condition loads may not have a dependence with "
+              "another memory operation.",
+              "EarlyExitLoadHasDependence", ORE,
+              TheLoop);
+          return false;
+        }
+      }
+    }
+  }
+
   if (!LAI->canVectorizeMemory())
     return canVectorizeIndirectUnsafeDependences();
 
@@ -1627,6 +1658,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   // Keep a record of all the exiting blocks.
   SmallVector<const SCEVPredicate *, 4> Predicates;
   std::optional<std::pair<BasicBlock *, BasicBlock *>> SingleUncountableEdge;
+  std::optional<LoadInst *> EELoad;
   for (BasicBlock *BB : ExitingBlocks) {
     const SCEV *EC =
         PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates);
@@ -1656,6 +1688,21 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
         return false;
       }
 
+      // For loops with stores.
+      // Record load for analysis by isDereferenceableAndAlignedInLoop
+      // and later by dependence analysis.
+      if (BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator())) {
+        // FIXME: Handle exit conditions with multiple users, more complex exit
+        //        conditions than br(icmp(load, loop_inv)).
+        ICmpInst *Cmp = dyn_cast<ICmpInst>(Br->getCondition());
+        if (Cmp && Cmp->hasOneUse() &&
+            TheLoop->isLoopInvariant(Cmp->getOperand(1))) {
+          LoadInst *Load = dyn_cast<LoadInst>(Cmp->getOperand(0));
+          if (Load && Load->hasOneUse() && TheLoop->contains(Load))
+            EELoad = Load;
+        }
+      }
+
       SingleUncountableEdge = {BB, ExitBlock};
     } else
       CountableExitingBlocks.push_back(BB);
@@ -1708,16 +1755,31 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
     }
   };
 
+  bool HasStore = false;
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+        HasStore = true;
+        if (SI->isSimple())
+          continue;
+
+        reportVectorizationFailure(
+            "Complex writes to memory unsupported in early exit loops",
+            "Cannot vectorize early exit loop with complex writes to memory",
+            "WritesInEarlyExitLoop", ORE, TheLoop);
+        return false;
+      }
+
       if (I.mayWriteToMemory()) {
         // We don't support writes to memory.
         reportVectorizationFailure(
-            "Writes to memory unsupported in early exit loops",
-            "Cannot vectorize early exit loop with writes to memory",
+            "Complex writes to memory unsupported in early exit loops",
+            "Cannot vectorize early exit loop with complex writes to memory",
             "WritesInEarlyExitLoop", ORE, TheLoop);
         return false;
-      } else if (!IsSafeOperation(&I)) {
+      }
+
+      if (!IsSafeOperation(&I)) {
         reportVectorizationFailure("Early exit loop contains operations that "
                                    "cannot be speculatively executed",
                                    "UnsafeOperationsEarlyExitLoop", ORE,
@@ -1732,13 +1794,53 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
 
   // TODO: Handle loops that may fault.
   Predicates.clear();
-  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
-                                     &Predicates)) {
+
+  if (HasStore && EELoad.has_value()) {
+    LoadInst *LI = *EELoad;
+    if (isDereferenceableAndAlignedInLoop(LI, TheLoop, *PSE.getSE(), *DT, AC,
+                                          &Predicates)) {
+      ICFLoopSafetyInfo SafetyInfo;
+      SafetyInfo.computeLoopSafetyInfo(TheLoop);
+      // FIXME: We may have multiple levels of conditional loads, so will
+      //        need to improve on outright rejection at some point.
+      if (!SafetyInfo.isGuaranteedToExecute(*LI, DT, TheLoop)) {
+        // The load is speculated ahead of its scalar position; bail out.
+        reportVectorizationFailure(
+            "Early exit condition load not guaranteed to execute",
+            "Cannot vectorize early exit loop when condition load is not "
+            "guaranteed to execute",
+            "EarlyExitLoadNotGuaranteed", ORE, TheLoop);
+        return false;
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "Early exit condition load potentially unsafe.\n");
+      reportVectorizationFailure("Uncounted loop condition not known safe",
+                                 "Cannot vectorize early exit loop with "
+                                 "possibly unsafe condition load",
+                                 "PotentiallyFaultingEarlyExitLoop", ORE,
+                                 TheLoop);
+      return false;
+    }
+  } else if (HasStore) {
+    LLVM_DEBUG(dbgs() << "Found early exit store but no condition load.\n");
     reportVectorizationFailure(
-        "Loop may fault",
-        "Cannot vectorize potentially faulting early exit loop",
-        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+        "Early exit loop with store but no condition load",
+        "Cannot vectorize early exit loop with store but no condition load",
+        "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
     return false;
+  } else {
+    // Read-only loop.
+    // FIXME: as with the loops with stores, only the loads contributing to
+    //        the loop condition need to be guaranteed dereferenceable and
+    //        aligned.
+    if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
+                                       &Predicates)) {
+      reportVectorizationFailure(
+          "Loop may fault",
+          "Cannot vectorize potentially faulting early exit loop",
+          "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+      return false;
+    }
   }
 
   [[maybe_unused]] const SCEV *SymbolicMaxBTC =
@@ -1751,6 +1853,11 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
                        "backedge taken count: "
                     << *SymbolicMaxBTC << '\n');
   UncountableEdge = SingleUncountableEdge;
+  if (HasStore) {
+    RequiresEarlyExitConditionCopy = true;
+    EarlyExitLoad = EELoad;
+  }
+
   return true;
 }
 
@@ -1823,6 +1930,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     } else {
       if (!isVectorizableEarlyExitLoop()) {
         UncountableEdge = std::nullopt;
+        EarlyExitLoad = std::nullopt;
+        RequiresEarlyExitConditionCopy = false;
         if (DoExtraAnalysis)
           Result = false;
         else
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4c1ed15ee700f..7dca356edca65 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9246,6 +9246,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
         VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
                                  *Plan, CM.getMinimalBitwidths());
       VPlanTransforms::optimize(*Plan);
+
+      // See if we can convert an early exit vplan to bail out to a scalar
+      // loop if state-changing operations (like stores) are present and
+      // an exit will be taken in the next vector iteration.
+      // If not, discard the plan.
+      if (Legal->isConditionCopyRequired() && !HasScalarVF &&
+          !VPlanTransforms::runPass(VPlanTransforms::tryEarlyExitConversion,
+                                    *Plan))
+        break;
       // TODO: try to put it close to addActiveLaneMask().
       // Discard the plan if it is not EVL-compatible
       if (CM.foldTailWithEVL() && !HasScalarVF &&
@@ -9570,6 +9579,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
           },
           Range);
   auto Plan = std::make_unique<VPlan>(OrigLoop);
+
+  // FIXME: Better place to put this? Or maybe an enum for how to handle
+  //        early exits?
+  if (Legal->hasUncountableEarlyExit())
+    Plan->setEarlyExitContinuesInScalarLoop(Legal->isConditionCopyRequired());
+
   // Build hierarchical CFG.
   // TODO: Convert to VPlan-transform and consolidate all transforms for VPlan
   // creation.
@@ -9876,6 +9891,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
 
   // Create new empty VPlan
   auto Plan = std::make_unique<VPlan>(OrigLoop);
+
+  // FIXME: Better place to put this? Or maybe an enum for how to handle
+  //        early exits?
+  if (Legal->hasUncountableEarlyExit())
+    Plan->setEarlyExitContinuesInScalarLoop(Legal->isConditionCopyRequired());
+
   // Build hierarchical CFG
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
   HCFGBuilder.buildPlainCFG();
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 7084676af6d5b..5320290049b42 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3522,6 +3522,13 @@ class VPlan {
   /// VPlan is destroyed.
   SmallVector<VPBlockBase *> CreatedBlocks;
 
+  /// Indicates that an early exit loop will exit before the condition is
+  /// reached, and that the scalar loop must perform the last few iterations.
+  /// FIXME: Is this the right place? We mainly want to make sure that we
+  ///        know about this for transforming the plan to copy&move the exit
+  ///        condition, but maybe it doesn't need to be in the plan itself.
+  bool EarlyExitContinuesInScalarLoop = false;
+
   /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
   /// wrapping the original header of the scalar loop.
   VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
@@ -3825,6 +3832,16 @@ class VPlan {
     return ExitBlocks.size() > 1 || ExitBlocks[0]->getNumPredecessors() > 1;
   }
 
+  /// Returns true if all exit paths should reach the scalar loop.
+  bool shouldEarlyExitContinueInScalarLoop() const {
+    return EarlyExitContinuesInScalarLoop;
+  }
+
+  /// Set early exit vectorization to always reach the scalar loop.
+  void setEarlyExitContinuesInScalarLoop(bool Continues) {
+    EarlyExitContinuesInScalarLoop = Continues;
+  }
+
   /// Returns true if the scalar tail may execute after the vector loop. Note
   /// that this relies on unneeded branches to the scalar tail loop being
   /// removed.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d8f1706cf61b..aef3d9b728ea5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2469,61 +2469,63 @@ void VPlanTransforms::handleUncountableEarlyExit(
   // block and have it conditionally branch to the early exit block if
   // EarlyExitTaken.
   auto *EarlyExitingBranch =
-      cast<BranchInst>(UncountableExitingBlock->getTerminator());
+      cast<BranchInst>(UncountableExitingBlock->getTerminator());
   BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0);
   BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1);
   BasicBlock *EarlyExitIRBB =
-      !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc;
+      !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc;
   VPIRBasicBlock *VPEarlyExitBlock = Plan.getExitBlock(EarlyExitIRBB);
-
+
   VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask(
-      OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
-  auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond);
-  IsEarlyExitTaken =
-      Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
-
-  VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
-  VPBasicBlock *VectorEarlyExitVPBB =
-      Plan.createVPBasicBlock("vector.early.exit");
-  VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
-  VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
-  NewMiddle->swapSuccessors();
-
-  VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, VPEarlyExitBlock);
-
-  // Update the exit phis in the early exit block.
-  VPBuilder MiddleBuilder(NewMiddle);
-  VPBuilder EarlyExitB(VectorEarlyExitVPBB);
-  for (VPRecipeBase &R : *VPEarlyExitBlock) {
-    auto *ExitIRI = dyn_cast<VPIRPhi>(&R);
-    if (!ExitIRI)
-      break;
+      OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc);
+  auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond);
+  IsEarlyExitTaken =
+      Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
+
+  if (!Plan.shouldEarlyExitContinueInScalarLoop()) {
+    VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
+    VPBasicBlock *VectorEarlyExitVPBB =
+        Plan.createVPBasicBlock("vector.early.exit");
+    VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
+    VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB);
+    NewMiddle->swapSuccessors();
+
+    VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, VPEarlyExitBlock);
+
+    // Update the exit phis in the early exit block.
+    VPBuilder MiddleBuilder(NewMiddle);
+    VPBuilder EarlyExitB(VectorEarlyExitVPBB);
+    for (VPRecipeBase &R : *VPEarlyExitBlock) {
+      auto *ExitIRI = dyn_cast<VPIRPhi>(&R);
+      if (!ExitIRI)
+        break;
 
-    PHINode &ExitPhi = ExitIRI->getIRPhi();
-    VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn(
-        ExitPhi.getIncomingValueForBlock(UncountableExitingBlock));
-
-    if (OrigLoop->getUniqueExitBlock()) {
-      // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors
-      // (MiddleVPBB and NewMiddle). Add the incoming value from MiddleVPBB
-      // which is coming from the original latch.
-      VPValue *IncomingFromLatch = RecipeBuilder.getVPValueOrAddLiveIn(
-          ExitPhi.getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-      ExitIRI->addOperand(IncomingFromLatch);
-      ExitIRI->extractLastLaneOfOperand(MiddleBuilder);
-    }
-    // Add the incoming value from the early exit.
-    if (!IncomingFromEarlyExit->isLiveIn() && !Plan.hasScalarVFOnly()) {
-      VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
-          VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr,
-          "first.active.lane");
-      IncomingFromEarlyExit = EarlyExitB.createNaryOp(
-          Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
-          nullptr, "early.exit.value");
+      PHINode &ExitPhi = ExitIRI->getIRPhi();
+      VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn(
+          ExitPhi.getIncomingValueForBlock(UncountableExitingBlock));
+
+      if (OrigLoop->getUniqueExitBlock()) {
+        // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors
+        // (MiddleVPBB and NewMiddle). Add the incoming value from MiddleVPBB
+        // which is coming from the original latch.
+        VPValue *IncomingFromLatch = RecipeBuilder.getVPValueOrAddLiveIn(
+            ExitPhi.getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+        ExitIRI->addOperand(IncomingFromLatch);
+        ExitIRI->extractLastLaneOfOperand(MiddleBuilder);
+      }
+      // Add the incoming value from the early exit.
+      if (!IncomingFromEarlyExit->isLiveIn() && !Plan.hasScalarVFOnly()) {
+        VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
+            VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr,
+            "first.active.lane");
+        IncomingFromEarlyExit = EarlyExitB.createNaryOp(
+            Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
+            nullptr, "early.exit.value");
+      }
+      ExitIRI->addOperand(IncomingFromEarlyExit);
     }
-    ExitIRI->addOperand(IncomingFromEarlyExit);
+    MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
   }
-  MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
 
   // Replace the condition controlling the non-early exit from the vector loop
   // with one exiting if either the original condition of the vector latch is
@@ -2538,6 +2540,115 @@ void VPlanTransforms::handleUncountableEarlyExit(
       Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken});
   Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
   LatchExitingBranch->eraseFromParent();
+
+  // If the counted exit wasn't taken, continue in the scalar loop.
+  if (Plan.shouldEarlyExitContinueInScalarLoop()) {
+    VPRecipeBase *OldTerminator = MiddleVPBB->getTerminator();
+    VPBuilder MiddleB(OldTerminator);
+    MiddleB.createNaryOp(VPInstruction::BranchOnCond, {IsLatchExitTaken});
+    OldTerminator->eraseFromParent();
+  }
+}
+
+bool VPlanTransforms::tryEarlyExitConversion(VPlan &Plan) {
+  // We can abandon a vplan entirely if we return false here, so we shouldn't
+  // crash if some earlier assumptions on scalar IR don't hold for the vplan
+  // version of the loop.
+  if (Plan.hasScalarVFOnly())
+    return false;
+  auto *Region = Plan.getVectorLoopRegion();
+  auto *Branch = dyn_cast<VPInstruction>(Region->getExitingBasicBlock()->getTerminator());
+  if (!Branch || Branch->getOpcode() != VPInstruction::BranchOnCond)
+    return false;
+
+  // Extract the IR needed to create the uncountable exit condition.
+  // Looking for br(or(any_of(icmp(load(gep(base, iv)), loop_inv)), counted)
+  // FIXME: Build a list of nodes to copy below instead of matching
+  //        the exact pattern.
+  // FIXME: We should be able to handle multiple users for at least some of
+  //        these nodes; requires creating phis.
+  // FIXME: This does feel a bit fragile; is it better to do this earlier
+  //        when creating the initial recipe based on the scalar IR, instead
+  //        of the vplan equivalent here?
+  // FIXME: New vplan pattern matchers; m_Load, m_ICmp, m_OneUse, etc.
+  VPInstruction *Or = dyn_cast<VPInstruction>(Branch->getOperand(0));
+  if (!Or || Or->getNumUsers() != 1 || Or->getOpcode() != Instruction::Or)
+    return false;
+  auto *AnyOf = dyn_cast<VPInstruction>(Or->getOperand(0));
+  if (!AnyOf || AnyOf->getNumUsers() != 1 ||
+      AnyOf->getOpcode() != VPInstruction::AnyOf)
+    return false;
+  auto *Cmp = dyn_cast<VPWidenRecipe>(AnyOf->getOperand(0));
+  if (!Cmp || Cmp->getNumUsers() != 1 || Cmp->getOpcode() != Instruction::ICmp ||
+      !Cmp->getOperand(1)->isDefinedOutsideLoopRegions())
+    return false;
+  auto *Load = dyn_cast<VPWidenLoadRecipe>(Cmp->getOperand(0));
+  if (!Load || Load->getNumUsers() != 1)
+    return false;
+  auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Load->getAddr());
+  if (!VecPtr || VecPtr->getNumUsers() != 1)
+    return false;
+  auto *GEP = dyn_cast<VPReplicateRecipe>(VecPtr->getOperand(0));
+  if (!GEP || GEP->getNumUsers() != 1 || GEP->getNumOperands() != 2 ||
+      GEP->getOpcode() != Instruction::GetElementPtr)
+    return false;
+  auto *Base = GEP->getOperand(0);
+  if (!Base->isDefinedOutsideLoopRegions())
+    return false;
+  auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(GEP->getOperand(1));
+  if (!Steps)
+    return false;
+  auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(Steps->getOperand(0));
+  if (!IV)
+    return false;
+  VPInstruction *IVUpdate = cast<VPInstruction>(IV->getBackedgeValue());
+
+  // Duplicate exit IR and use the starting value for the IV phi.
+  // FIXME: Is making new nodes better than cloning?
+  auto *VectorPH = Plan.getVectorPreheader();
+  VPBuilder PHBuilder(VectorPH, VectorPH->getFirstNonPhi());
+  VPReplicateRecipe *PHGEP = GEP->clone();
+  PHGEP->setOperand(1, IV->getStartValue());
+  PHBuilder.insert(PHGEP);
+  auto *PHVecPtr = VecPtr->clone();
+  PHVecPtr->setOperand(0, PHGEP);
+  PHBuilder.insert(PHVecPtr);
+  VPWidenLoadRecipe *PHLoad = Load->clone();
+  PHLoad->setOperand(0, PHVecPtr);
+  PHBuilder.insert(PHLoad);
+  VPWidenRecipe *PHCmp = Cmp->clone();
+  PHCmp->setOperand(0, PHLoad);
+  PHBuilder.insert(PHCmp);
+
+  // Split vector preheader to form a new bypass block.
+  VPBasicBlock *NewPH = VectorPH->splitAt(PHBuilder.getInsertPoint());
+  VPBasicBlock *ScalarPH = Plan.getScalarPreheader();
+  VPValue *PHAnyOf = PHBuilder.createNaryOp(VPInstruction::AnyOf, {PHCmp});
+  PHBuilder.createNaryOp(VPInstruction::BranchOnCond, {PHAnyOf},
+                         PHCmp->getDebugLoc());
+  VectorPH->clearSuccessors();
+  VectorPH->setTwoSuccessors(ScalarPH, NewPH);
+
+  // Fix up the resume phi in scalar preheader -- we might not have reached
+  // the calculated maximum vector tripcount, so just use the next value of IV.
+  // FIXME: Can we rely on the resume phi being first?
+  auto *ResumePHI = cast<VPInstruction>(&ScalarPH->front());
+  VPBasicBlock *MiddleBlock = Plan.getMiddleBlock();
+  ScalarPH->clearPredecessors();
+  ScalarPH->setPredecessors({MiddleBlock, VectorPH});
+  ResumePHI->addOperand(ResumePHI->getOperand(1));
+  ResumePHI->setOperand(0, IVUpdate);
+
+  // FIXME: May be better to move the IVUpdate before the GEP instead?
+  // FIXME: No domtree available at the recipe level, just for vpbasicblocks.
+  //        Can we find iterator distances in the same block?
+  GEP->moveAfter(IVUpdate);
+  GEP->setOperand(1, IVUpdate);
+  VecPtr->moveAfter(GEP);
+  Load->moveAfter(VecPtr);
+  Cmp->moveAfter(Load);
+
+  return true;
+}
 
 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index a9461b261ddb6..d28dc9fe30fa6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -152,6 +152,8 @@ struct VPlanTransforms {
   tryAddExplicitVectorLength(VPlan &Plan,
                              const std::optional<unsigned> &MaxEVLSafeElements);
 
+  static bool tryEarlyExitConversion(VPlan &Plan);
+
   // For each Interleave Group in \p InterleaveGroups replace the Recipes
   // widening its memory instructions with a single VPInterleaveRecipe at its
   // insertion point.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 3f10b1756d7a1..0e59530bf21d9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -212,7 +212,8 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
             isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPPredInstPHIRecipe,
                 VPIRPhi>(UI) ||
             (isa<VPInstruction>(UI) &&
-             cast<VPInstruction>(UI)->getOpcode() == Instruction::PHI))
+             (cast<VPInstruction>(UI)->getOpcode() == Instruction::PHI ||
+              cast<VPInstruction>(UI)->getOpcode() == VPInstruction::ResumePhi)))
           continue;
 
         // If the user is in the same block, check it comes after R in the
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 7d5b73477f6ed..85ab08b91abd5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -580,6 +580,81 @@ loop.end:
   ret i64 %retval
 }
 
+define void @loop_contains_store_single_user(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: define void @loop_contains_store_single_user(
+; CHECK-SAME: ptr noalias dereferenceable(40) [[ARRAY:%.*]], ptr readonly align 2 dereferenceable(40) [[PRED:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i16, ptr [[PRED]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], splat (i16 500)
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH]], label [[VECTOR_PH_SPLIT:%.*]]
+; CHECK:       vector.ph.split:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_SPLIT]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i16, ptr [[ARRAY]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i16> [[WIDE_LOAD1]], splat (i16 1)
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[PRED]], i64 [[INDEX_NEXT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD2]], splat (i16 500)
+; CHECK-NEXT:    [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]])
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
+; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP11]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_PH]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[ST_ADDR:%.*]] = getelementptr inbounds nuw i16, ptr [[ARRAY]], i64 [[IV]]
+; CHECK-NEXT:    [[DATA:%.*]] = load i16, ptr [[ST_ADDR]], align 2
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i16 [[DATA]], 1
+; CHECK-NEXT:    store i16 [[INC]], ptr [[ST_ADDR]], align 2
+; CHECK-NEXT:    [[EE_ADDR:%.*]] = getelementptr inbounds nuw i16, ptr [[PRED]], i64 [[IV]]
+; CHECK-NEXT:    [[EE_VAL:%.*]] = load i16, ptr [[EE_ADDR]], align 2
+; CHECK-NEXT:    [[EE_COND:%.*]] = icmp sgt i16 [[EE_VAL]], 500
+; CHECK-NEXT:    br i1 [[EE_COND]], label [[EXIT]], label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[COUNTED_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 20
+; CHECK-NEXT:    br i1 [[COUNTED_COND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 declare i32 @foo(i32) readonly
 declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>)
 
@@ -600,4 +675,6 @@ attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/control-flow.ll b/llvm/test/Transforms/LoopVectorize/control-flow.ll
index 3a8aec34dfe43..2578260fe878d 100644
--- a/llvm/test/Transforms/LoopVectorize/control-flow.ll
+++ b/llvm/test/Transforms/LoopVectorize/control-flow.ll
@@ -10,7 +10,7 @@
 ;   return 0;
 ; }
 
-; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop with writes to memory
+; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop with possibly unsafe condition load
 ; CHECK: remark: source.cpp:5:9: loop not vectorized
 
 ; CHECK: _Z4testPii
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index de455c81d363e..6eb9fc2adeb70 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -445,7 +445,7 @@ loop.end:
 
 define i64 @loop_contains_store(ptr %dest) {
 ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store'
-; CHECK:       LV: Not vectorizing: Writes to memory unsupported in early exit loops
+; CHECK:       LV: Not vectorizing: Early exit loop with store but no condition load.
 entry:
   %p1 = alloca [1024 x i8]
   call void @init_mem(ptr %p1, i64 1024)
@@ -470,6 +470,192 @@ loop.end:
   ret i64 %retval
 }
 
+define void @loop_contains_store_single_user(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_single_user'
+; CHECK:       LV: We can vectorize this loop!
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_multi_user(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_multi_user'
+; CHECK:       LV: Not vectorizing: Early exit loop with store but no condition load.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  %unused = add i16 %ee.val, 42
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_fcmp(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_fcmp'
+; CHECK:       LV: Not vectorizing: Early exit loop with store but no condition load.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw half, ptr %pred, i64 %iv
+  %ee.val = load half, ptr %ee.addr, align 2
+  %ee.cond = fcmp ugt half %ee.val, 500.0
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_safe_dependency(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(80) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_safe_dependency'
+; CHECK:       LV: Not vectorizing: No dependencies allowed for early exit condition load.
+entry:
+  %forward = getelementptr i16, ptr %pred, i64 -8
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  %some.addr = getelementptr inbounds nuw i16, ptr %forward, i64 %iv
+  store i16 42, ptr %some.addr, align 2
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_assumed_bounds(ptr noalias %array, ptr readonly %pred, i32 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_assumed_bounds'
+; CHECK:       LV: Not vectorizing: Uncounted loop condition not known safe.
+entry:
+  %n_bytes = mul nuw nsw i32 %n, 2
+  call void @llvm.assume(i1 true) [ "align"(ptr %pred, i64 2), "dereferenceable"(ptr %pred, i32 %n_bytes) ]
+  %tc = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, %tc
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @loop_contains_store_volatile(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_volatile'
+; CHECK:       LV: Not vectorizing: Complex writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store volatile i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @exit_conditions_combined(ptr noalias dereferenceable(40) %array, ptr readonly align 2 dereferenceable(40) %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'exit_conditions_combined'
+; CHECK:       LV: Not vectorizing: Cannot vectorize uncountable loop.
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  %or.cond = select i1 %ee.cond, i1 true, i1 %counted.cond
+  br i1 %or.cond, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  ret void
+}
 
 define i64 @uncountable_exit_in_conditional_block(ptr %mask) {
 ; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_in_conditional_block'



More information about the llvm-commits mailing list