[llvm] [RFC][LV] Add support for speculative loads in loops that may fault (PR #151300)

Shih-Po Hung via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 20 00:54:40 PDT 2025


https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/151300

From 606cb13870973d4188d13499e500f551f0ba6d71 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Tue, 19 Aug 2025 18:45:15 -0700
Subject: [PATCH 1/3] [LV] Add initial legality checks for loops with unbound
 loads.
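
The loops in question are search loops with an uncountable early exit, where
the trip count is only known once a match is found; a vectorized load can
therefore read past the matching element and fault. A minimal C++ sketch of
the pattern (illustrative only; it corresponds to the IR in the new
unbound-access-legality.ll test):

  // Illustrative only: std::find-style loop. On the final iteration only
  // *first is known to be dereferenceable, so a full vector load starting
  // at first may fault.
  const int *find_value(const int *first, const int *last, int value) {
    for (; first != last; ++first)
      if (*first == value)   // uncountable early exit
        return first;
    return last;
  }

Instead of rejecting such loops outright, legality now records the
potentially faulting loads as "speculative" and asks a new target hook,
isLegalSpeculativeLoad, whether they can be emitted safely.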

---
 llvm/include/llvm/Analysis/Loads.h            |  6 +-
 .../llvm/Analysis/TargetTransformInfo.h       |  3 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  4 +
 .../Vectorize/LoopVectorizationLegality.h     |  8 ++
 llvm/lib/Analysis/Loads.cpp                   |  9 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  5 ++
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 12 +++
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |  4 +
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  5 ++
 .../Vectorize/LoopVectorizationLegality.cpp   | 29 ++++++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  7 ++
 .../RISCV/unbound-access-legality.ll          | 84 +++++++++++++++++++
 llvm/unittests/Analysis/LoadsTest.cpp         |  5 +-
 13 files changed, 172 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/unbound-access-legality.ll

diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 84564563de8e3..7f28afafb3500 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -86,9 +86,11 @@ LLVM_ABI bool isDereferenceableAndAlignedInLoop(
     SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
 
 /// Return true if the loop \p L cannot fault on any iteration and only
-/// contains read-only memory accesses.
-LLVM_ABI bool isDereferenceableReadOnlyLoop(
+/// contains read-only memory accesses. Loads that are not guaranteed to be
+/// dereferenceable are collected in \p SpeculativeLoads.
+LLVM_ABI bool isReadOnlyLoopWithSafeOrSpeculativeLoads(
     Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    SmallVectorImpl<LoadInst *> *SpeculativeLoads,
     SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);
 
 /// Return true if we know that executing a load from this value cannot trap.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1e03209e888bf..8d33aaca53a00 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -843,6 +843,9 @@ class TargetTransformInfo {
   /// Return true if the target supports strided load.
   LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
 
+  /// Return true if the target supports speculative load.
+  LLVM_ABI bool isLegalSpeculativeLoad(Type *DataType, Align Alignment) const;
+
   /// Return true is the target supports interleaved access for the given vector
   /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and
   /// address space \p AddrSpace.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 252acf381a8e1..3bcfd2b0f9308 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -374,6 +374,10 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  virtual bool isLegalSpeculativeLoad(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 48ee93acbe008..727b663288aa2 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -445,6 +445,11 @@ class LoopVectorizationLegality {
   /// Returns a list of all known histogram operations in the loop.
   bool hasHistograms() const { return !Histograms.empty(); }
 
+  /// Returns the loads that may fault and therefore must be speculated.
+  const SmallPtrSetImpl<const Instruction *> &getSpeculativeLoads() const {
+    return SpeculativeLoads;
+  }
+
   PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
     return &PSE;
   }
@@ -633,6 +638,9 @@ class LoopVectorizationLegality {
   /// may work on the same memory location.
   SmallVector<HistogramInfo, 1> Histograms;
 
+  /// Holds all loads that need to be speculated.
+  SmallPtrSet<const Instruction *, 4> SpeculativeLoads;
+
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 9a2c9ba63ec7e..0c4ccde68b718 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -855,16 +855,19 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,
   return isPointerAlwaysReplaceable(From, To, DL);
 }
 
-bool llvm::isDereferenceableReadOnlyLoop(
+bool llvm::isReadOnlyLoopWithSafeOrSpeculativeLoads(
     Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+    SmallVectorImpl<LoadInst *> *SpeculativeLoads,
     SmallVectorImpl<const SCEVPredicate *> *Predicates) {
   for (BasicBlock *BB : L->blocks()) {
     for (Instruction &I : *BB) {
       if (auto *LI = dyn_cast<LoadInst>(&I)) {
         if (!isDereferenceableAndAlignedInLoop(LI, L, *SE, *DT, AC, Predicates))
-          return false;
-      } else if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
+          SpeculativeLoads->push_back(LI);
+      } else if (I.mayReadFromMemory() || I.mayWriteToMemory() ||
+                 I.mayThrow()) {
         return false;
+      }
     }
   }
   return true;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 323ab8b1ddad1..e33800a3b94d6 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -531,6 +531,11 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
   return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
 }
 
+bool TargetTransformInfo::isLegalSpeculativeLoad(Type *DataType,
+                                                 Align Alignment) const {
+  return TTIImpl->isLegalSpeculativeLoad(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalInterleavedAccessType(
     VectorType *VTy, unsigned Factor, Align Alignment,
     unsigned AddrSpace) const {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4a1db80076530..ea1d89867809e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24525,6 +24525,18 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
   return true;
 }
 
+bool RISCVTargetLowering::isLegalSpeculativeLoad(EVT DataType,
+                                                 Align Alignment) const {
+  if (!Subtarget.hasVInstructions())
+    return false;
+
+  EVT ScalarType = DataType.getScalarType();
+  if (!isLegalElementTypeForRVV(ScalarType))
+    return false;
+
+  return true;
+}
+
 MachineInstr *
 RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
                                    MachineBasicBlock::instr_iterator &MBBI,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index fb63ebcfaacea..071eb86ed35a6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -425,6 +425,10 @@ class RISCVTargetLowering : public TargetLowering {
   /// alignment is legal.
   bool isLegalStridedLoadStore(EVT DataType, Align Alignment) const;
 
+  /// Return true if a speculative load of the given result type and
+  /// alignment is legal.
+  bool isLegalSpeculativeLoad(EVT DataType, Align Alignment) const;
+
   unsigned getMaxSupportedInterleaveFactor() const override { return 8; }
 
   bool fallBackToDAGISel(const Instruction &Inst) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index b632f25b963f7..b9cac933f8f26 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -332,6 +332,11 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
     return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment);
   }
 
+  bool isLegalSpeculativeLoad(Type *DataType, Align Alignment) const override {
+    EVT DataTypeVT = TLI->getValueType(DL, DataType);
+    return TLI->isLegalSpeculativeLoad(DataTypeVT, Alignment);
+  }
+
   bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
                                     Align Alignment,
                                     unsigned AddrSpace) const override {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 789047a2a28e7..eef37811cc0d6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1776,16 +1776,39 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   assert(LatchBB->getUniquePredecessor() == SingleUncountableExitingBlock &&
          "Expected latch predecessor to be the early exiting block");
 
-  // TODO: Handle loops that may fault.
   Predicates.clear();
-  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
-                                     &Predicates)) {
+  SmallVector<LoadInst *, 4> NonDerefLoads;
+  if (!isReadOnlyLoopWithSafeOrSpeculativeLoads(TheLoop, PSE.getSE(), DT, AC,
+                                                &NonDerefLoads, &Predicates)) {
     reportVectorizationFailure(
         "Loop may fault",
         "Cannot vectorize potentially faulting early exit loop",
         "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
     return false;
   }
+  // Check the non-dereferenceable loads, if any.
+  for (LoadInst *LI : NonDerefLoads) {
+    // Only support unit-stride access for now.
+    int Stride = isConsecutivePtr(LI->getType(), LI->getPointerOperand());
+    if (Stride != 1) {
+      reportVectorizationFailure("Loop contains strided unbound access",
+                                 "Cannot vectorize early exit loop with "
+                                 "speculative strided load",
+                                 "SpeculativeNonUnitStrideLoadEarlyExitLoop",
+                                 ORE, TheLoop);
+      return false;
+    }
+    if (!TTI->isLegalSpeculativeLoad(LI->getType(), LI->getAlign())) {
+      reportVectorizationFailure("Loop may fault",
+                                 "Cannot vectorize early exit loop with "
+                                 "illegal speculative load",
+                                 "IllegalSpeculativeLoadEarlyExitLoop", ORE,
+                                 TheLoop);
+      return false;
+    }
+    SpeculativeLoads.insert(LI);
+    LLVM_DEBUG(dbgs() << "LV: Found speculative load: " << *LI << "\n");
+  }
 
   [[maybe_unused]] const SCEV *SymbolicMaxBTC =
       PSE.getSymbolicMaxBackedgeTakenCount();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 70f884016d08c..174791d432a35 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9985,6 +9985,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
+  if (!LVL.getSpeculativeLoads().empty()) {
+    reportVectorizationFailure("Auto-vectorization of loops with speculative "
+                               "load is not supported",
+                               "SpeculativeLoadsNotSupported", ORE, L);
+    return false;
+  }
+
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/unbound-access-legality.ll b/llvm/test/Transforms/LoopVectorize/RISCV/unbound-access-legality.ll
new file mode 100644
index 0000000000000..07e64784da84b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/unbound-access-legality.ll
@@ -0,0 +1,84 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+
+define ptr @unsupported_data_type(ptr %first, ptr %last, i128 %value) {
+; CHECK-LABEL: LV: Checking a loop in 'unsupported_data_type'
+; CHECK:  LV: Not vectorizing: Loop may fault.
+entry:
+  %cond = icmp eq ptr %first, %last
+  br i1 %cond, label %return, label %for.body
+
+for.body:
+  %first.addr = phi ptr [ %first, %entry ], [ %first.next, %for.inc ]
+  %1 = load i128, ptr %first.addr, align 4
+  %cond2 = icmp eq i128 %1, %value
+  br i1 %cond2, label %for.end, label %for.inc
+
+for.inc:
+  %first.next = getelementptr inbounds i128, ptr %first.addr, i64 1
+  %cond3 = icmp eq ptr %first.next, %last
+  br i1 %cond3, label %for.end, label %for.body
+
+for.end:
+  %retval.ph = phi ptr [ %first.addr, %for.body ], [ %last, %for.inc ]
+  br label %return
+
+return:
+  %retval = phi ptr [ %first, %entry ], [ %retval.ph, %for.end ]
+  ret ptr %retval
+}
+
+define ptr @unbound_strided_access(ptr %first, ptr %last, i32 %value) {
+; CHECK-LABEL: LV: Checking a loop in 'unbound_strided_access'
+; CHECK:       LV: Not vectorizing: Loop contains strided unbound access.
+entry:
+  %cond = icmp eq ptr %first, %last
+  br i1 %cond, label %return, label %for.body
+
+for.body:
+  %first.addr = phi ptr [ %first, %entry ], [ %first.next, %for.inc ]
+  %1 = load i32, ptr %first.addr, align 4
+  %cond2 = icmp eq i32 %1, %value
+  br i1 %cond2, label %for.end, label %for.inc
+
+for.inc:
+  %first.next = getelementptr inbounds i32, ptr %first.addr, i64 2
+  %cond3 = icmp eq ptr %first.next, %last
+  br i1 %cond3, label %for.end, label %for.body
+
+for.end:
+  %retval.ph = phi ptr [ %first.addr, %for.body ], [ %last, %for.inc ]
+  br label %return
+
+return:
+  %retval = phi ptr [ %first, %entry ], [ %retval.ph, %for.end ]
+  ret ptr %retval
+}
+
+define ptr @single_unbound_access(ptr %first, ptr %last, i32 %value) {
+; CHECK-LABEL: LV: Checking a loop in 'single_unbound_access'
+; CHECK:       LV: We can vectorize this loop!
+; CHECK-NEXT:  LV: Not vectorizing: Auto-vectorization of loops with speculative load is not supported.
+entry:
+  %cond = icmp eq ptr %first, %last
+  br i1 %cond, label %return, label %for.body
+
+for.body:
+  %first.addr = phi ptr [ %first, %entry ], [ %first.next, %for.inc ]
+  %1 = load i32, ptr %first.addr, align 4
+  %cond2 = icmp eq i32 %1, %value
+  br i1 %cond2, label %for.end, label %for.inc
+
+for.inc:
+  %first.next = getelementptr inbounds i32, ptr %first.addr, i64 1
+  %cond3 = icmp eq ptr %first.next, %last
+  br i1 %cond3, label %for.end, label %for.body
+
+for.end:
+  %retval.ph = phi ptr [ %first.addr, %for.body ], [ %last, %for.inc ]
+  br label %return
+
+return:
+  %retval = phi ptr [ %first, %entry ], [ %retval.ph, %for.end ]
+  ret ptr %retval
+}
diff --git a/llvm/unittests/Analysis/LoadsTest.cpp b/llvm/unittests/Analysis/LoadsTest.cpp
index c4f5b22318e34..fab2aeb745ad0 100644
--- a/llvm/unittests/Analysis/LoadsTest.cpp
+++ b/llvm/unittests/Analysis/LoadsTest.cpp
@@ -195,7 +195,10 @@ loop.end:
     assert(Header->getName() == "loop");
     Loop *L = LI.getLoopFor(Header);
 
-    return isDereferenceableReadOnlyLoop(L, &SE, &DT, &AC);
+    SmallVector<LoadInst *, 4> NonDerefLoads;
+    return isReadOnlyLoopWithSafeOrSpeculativeLoads(L, &SE, &DT, &AC,
+                                                    &NonDerefLoads) &&
+           NonDerefLoads.empty();
   };
 
   ASSERT_TRUE(IsDerefReadOnlyLoop(F1));

From 14e199f9e40019763f2e82ab7eda29b9f3fbd083 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 25 Jul 2025 16:24:16 -0700
Subject: [PATCH 2/3] Enable early-exit loops with EVL
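
With tail folding by EVL there is no scalar epilogue, so the vector latch must
exit when either the uncountable early exit fired or all remaining lanes have
been processed. A scalar model of the combined exit condition this patch
builds (the names IsEarlyExitTaken / AnyExitTaken come from the code below;
the model itself is only an illustrative sketch, not VPlan code):

  // Illustrative model only.
  // early     - reduce.or of the early-exit compare over the active lanes
  // processed - elements consumed so far (the EVL-based IV after increment)
  bool any_exit_taken(bool early, unsigned long processed,
                      unsigned long trip_count) {
    bool main_exit = (processed == trip_count);
    return early || main_exit;   // branch to middle.split when true
  }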

---
 .../Vectorize/LoopVectorizationLegality.cpp   | 10 ++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  5 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 25 ++++++
 ...ectorize-force-tail-with-evl-early-exit.ll | 86 +++++++++++++++++++
 4 files changed, 118 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index eef37811cc0d6..39d0cd3c4a452 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1943,10 +1943,12 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
 
 bool LoopVectorizationLegality::canFoldTailByMasking() const {
   // The only loops we can vectorize without a scalar epilogue, are loops with
-  // a bottom-test and a single exiting block. We'd have to handle the fact
-  // that not every instruction executes on the last iteration.  This will
-  // require a lane mask which varies through the vector loop body.  (TODO)
-  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+  // a bottom-test and a single exiting block or those with early exits. We'd
+  // have to handle the fact that not every instruction executes on the last
+  // iteration. This will require a lane mask which varies through the vector
+  // loop body. (TODO)
+  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
+      !hasUncountableEarlyExit()) {
     LLVM_DEBUG(
         dbgs()
         << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 174791d432a35..4ace46ed8e6f5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8668,10 +8668,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // count is >= increment and a multiple of the increment.
   bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
   if (!HasNUW) {
-    auto *IVInc = Plan->getVectorLoopRegion()
-                      ->getExitingBasicBlock()
-                      ->getTerminator()
-                      ->getOperand(0);
+    auto *IVInc = Plan->getCanonicalIV()->getBackedgeValue();
     assert(match(IVInc, m_VPInstruction<Instruction::Add>(
                             m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
            "Did not find the canonical IV increment");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cff43c2742a6b..aca189280a938 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2040,6 +2040,16 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   // Replace the original terminator with BranchOnCond. We have to invert the
   // mask here because a true condition means jumping to the exit block.
   auto *NotMask = Builder.createNot(ALM, DL);
+  using namespace VPlanPatternMatch;
+  if (VPValue *IsEarlyExitTaken = nullptr; match(
+          OriginalTerminator, m_BranchOnCond(m_BinaryOr(
+                                  m_VPValue(IsEarlyExitTaken), m_VPValue())))) {
+    auto *AnyExitTaken =
+        Builder.createNaryOp(Instruction::Or, {IsEarlyExitTaken, NotMask});
+    OriginalTerminator->setOperand(0, AnyExitTaken);
+    return LaneMaskPhi;
+  }
+
   Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
   OriginalTerminator->eraseFromParent();
   return LaneMaskPhi;
@@ -2480,6 +2490,21 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
   // Skip single-iteration loop region
   if (match(LatchExitingBr, m_BranchOnCond(m_True())))
     return;
+
+  // In early-exit loops, replace VectorTripCount in the latch compare with TripCount.
+  if (VPValue *VPMainExitCond = nullptr;
+      match(LatchExitingBr, m_BranchOnCond(m_BinaryOr(
+                                m_VPValue(), m_VPValue(VPMainExitCond)))) &&
+      match(VPMainExitCond, m_VPInstruction<Instruction::ICmp>(
+                                m_Specific(EVLIncrement),
+                                m_Specific(&Plan.getVectorTripCount())))) {
+    // Expected pattern here is:
+    //   EMIT vp<%main.exit.cond> = icmp eq vp<%evl.next>, vp<%vtc>
+    //   EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+    //   EMIT branch-on-cond vp<%exit.cond>
+    VPMainExitCond->getDefiningRecipe()->setOperand(1, Plan.getTripCount());
+    return;
+  }
   assert(LatchExitingBr &&
          match(LatchExitingBr,
                m_BranchOnCount(m_VPValue(EVLIncrement),
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll
new file mode 100644
index 0000000000000..f126771449d5d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-early-exit.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S -enable-early-exit-vectorization  %s | FileCheck %s
+
+declare void @init(ptr)
+
+define i64 @multi_exiting_to_different_exits_live_in_exit_values() {
+; CHECK-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [128 x i32], align 4
+; CHECK-NEXT:    call void @init(ptr [[SRC]])
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ 128, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[GEP_SRC]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 10)
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 128
+; CHECK-NEXT:    [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    br i1 [[TMP19]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[E2:.*]]
+; CHECK:       [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT:    br label %[[E1:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[L]], 10
+; CHECK-NEXT:    br i1 [[C_1]], label %[[E1]], label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[INC]] = add nuw i64 [[IV1]], 1
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i64 [[INC]], 128
+; CHECK-NEXT:    br i1 [[C_2]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[E1]]:
+; CHECK-NEXT:    [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    ret i64 [[P1]]
+; CHECK:       [[E2]]:
+; CHECK-NEXT:    [[P2:%.*]] = phi i64 [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[P2]]
+;
+entry:
+  %src = alloca [128 x i32]
+  call void @init(ptr %src)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %c.1 = icmp eq i32 %l, 10
+  br i1 %c.1, label %e1, label %loop.latch
+
+loop.latch:
+  %inc = add nuw i64 %iv, 1
+  %c.2 = icmp eq i64 %inc, 128
+  br i1 %c.2, label %e2, label %loop.header
+
+e1:
+  %p1 = phi i64 [ 0, %loop.header ]
+  ret i64 %p1
+
+e2:
+  %p2 = phi i64 [ 1, %loop.latch ]
+  ret i64 %p2
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
+;.

From 1bb79f8d061cc34f1c1c8d60f89264729c9440a9 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Wed, 12 Mar 2025 00:14:28 -0700
Subject: [PATCH 3/3] Add WidenFFLoad
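
WidenFFLoad wraps the llvm.vp.load.ff (fault-only-first) intrinsic, which
returns both the loaded vector and the number of lanes actually read: a fault
on lane 0 still traps, while a fault on a later lane merely truncates the
returned length, which then feeds the next EVL / IV increment. A scalar C++
model of the semantics the recipe relies on (a sketch under that assumption,
not the intrinsic's implementation; ff_load_model and can_read are
hypothetical names, with can_read standing in for the hardware's access
check):

  #include <cstdint>
  #include <vector>

  // Illustrative model only.
  struct FFLoadResult {
    std::vector<int32_t> data; // lanes [0, new_vl) hold loaded values
    uint32_t new_vl = 0;       // number of lanes actually loaded
  };

  FFLoadResult ff_load_model(const int32_t *p, uint32_t evl,
                             bool (*can_read)(const int32_t *)) {
    FFLoadResult r;
    for (uint32_t i = 0; i < evl; ++i) {
      if (i > 0 && !can_read(p + i))
        break;                 // later lane would fault: stop, do not trap
      r.data.push_back(p[i]);  // lane 0 traps for real if unreadable
      ++r.new_vl;
    }
    return r;
  }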

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  36 +++-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  70 ++++++++
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   3 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  56 ++++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  69 ++++++--
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   4 +
 .../Transforms/Vectorize/VPlanVerifier.cpp    |   5 +-
 .../Transforms/LoopVectorize/RISCV/find.ll    | 161 ++++++++++++++++++
 8 files changed, 386 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/find.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4ace46ed8e6f5..8b8b194aa9af0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -402,6 +402,10 @@ static cl::opt<bool> EnableEarlyExitVectorization(
     cl::desc(
         "Enable vectorization of early exit loops with uncountable exits."));
 
+static cl::opt<bool> EnableSpeculativeLoads(
+    "enable-speculative-load", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization of loops with speculative loads."));
+
 // Likelyhood of bypassing the vectorized loop because there are zero trips left
 // after prolog. See `emitIterationCountCheck`.
 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
@@ -1365,6 +1369,9 @@ class LoopVectorizationCostModel {
     if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
         ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
       return;
+    // Do not override EVL styles for speculative loads.
+    if (!Legal->getSpeculativeLoads().empty())
+      return;
     // Override EVL styles if needed.
     // FIXME: Investigate opportunity for fixed vector factor.
     bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
@@ -4164,6 +4171,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPWidenPointerInductionSC:
       case VPDef::VPReductionPHISC:
       case VPDef::VPInterleaveSC:
+      case VPDef::VPWidenFFLoadEVLSC:
+      case VPDef::VPWidenFFLoadSC:
       case VPDef::VPWidenLoadEVLSC:
       case VPDef::VPWidenLoadSC:
       case VPDef::VPWidenStoreEVLSC:
@@ -7749,6 +7758,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
     Builder.insert(VectorPtr);
     Ptr = VectorPtr;
   }
+  if (Legal->getSpeculativeLoads().contains(I)) {
+    auto *Load = dyn_cast<LoadInst>(I);
+    return new VPWidenFFLoadRecipe(*Load, Ptr, Mask, VPIRMetadata(*Load, LVer),
+                                   I->getDebugLoc());
+  }
+
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                  VPIRMetadata(*Load, LVer), I->getDebugLoc());
@@ -9982,7 +9997,26 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  if (!LVL.getSpeculativeLoads().empty()) {
+  if (EnableSpeculativeLoads) {
+    // VPWidenFFLoadEVLRecipe is currently the only concrete recipe that
+    // generates speculative load intrinsics. Since it relies on the EVL
+    // transform, speculative loads are only supported when tail-folding with
+    // EVL is enabled.
+    if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL ||
+        PreferPredicateOverEpilogue !=
+            PreferPredicateTy::PredicateOrDontVectorize) {
+      reportVectorizationFailure("Auto-vectorization of loops with speculative "
+                                 "load is not enabled",
+                                 "SpeculativeLoadsDisabled", ORE, L);
+      return false;
+    }
+    if (LVL.getSpeculativeLoads().size() > 1) {
+      reportVectorizationFailure("Auto-vectorization of loops with more than 1 "
+                                 "speculative load is not enabled",
+                                 "MoreThanOneSpeculativeLoads", ORE, L);
+      return false;
+    }
+  } else if (!LVL.getSpeculativeLoads().empty()) {
     reportVectorizationFailure("Auto-vectorization of loops with speculative "
                                "load is not supported",
                                "SpeculativeLoadsNotSupported", ORE, L);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 46e55be3f643b..6bacfb4ec7337 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -559,6 +559,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
+    case VPRecipeBase::VPWidenFFLoadEVLSC:
+    case VPRecipeBase::VPWidenFFLoadSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
     case VPRecipeBase::VPWidenStoreEVLSC:
@@ -3026,6 +3028,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   static inline bool classof(const VPRecipeBase *R) {
     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenFFLoadSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenFFLoadEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
   }
@@ -3107,6 +3111,72 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
   }
 };
 
+struct VPWidenFFLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
+  VPWidenFFLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
+                      const VPIRMetadata &Metadata, DebugLoc DL)
+      : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadSC, Load, {Addr},
+                            /*Consecutive*/ true, /*Reverse*/ false, Metadata,
+                            DL),
+        VPValue(this, &Load) {
+    setMask(Mask);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadSC);
+
+  void execute(VPTransformState &State) override {
+    llvm_unreachable("cannot execute this recipe, should be replaced by "
+                     "VPWidenFFLoadEVLRecipe");
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getAddr();
+  }
+};
+
+struct VPWidenFFLoadEVLRecipe final : public VPWidenMemoryRecipe,
+                                      public VPValue {
+  VPWidenFFLoadEVLRecipe(VPWidenFFLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadEVLSC, L.getIngredient(),
+                            {L.getAddr(), &EVL}, true, false, L,
+                            L.getDebugLoc()),
+        VPValue(this, &getIngredient()) {
+    new VPValue(nullptr, this); // Second defined value: the returned new VL.
+    setMask(Mask);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadEVLSC);
+
+  /// Return the EVL operand.
+  VPValue *getEVL() const { return getOperand(1); }
+
+  /// Generate a fault-only-first load via the vp.load.ff intrinsic.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    // Widened, consecutive load operations only demand the first lane of
+    // their address.
+    return Op == getEVL() || Op == getAddr();
+  }
+};
+
 /// A recipe for widening load operations with vector-predication intrinsics,
 /// using the address to load from, the explicit vector length and an optional
 /// mask.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index b39231f106300..a5502aff92ac1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -187,7 +187,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
+  assert((isa<VPWidenLoadRecipe, VPWidenFFLoadRecipe, VPWidenFFLoadEVLRecipe,
+              VPWidenLoadEVLRecipe>(R)) &&
          "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7ca9b230f5aae..238583cc4ade2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -84,6 +84,8 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenIntOrFpInductionSC:
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
+  case VPWidenFFLoadEVLSC:
+  case VPWidenFFLoadSC:
   case VPWidenPHISC:
   case VPWidenSC:
   case VPWidenSelectSC: {
@@ -107,6 +109,8 @@ bool VPRecipeBase::mayReadFromMemory() const {
     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
+  case VPWidenFFLoadEVLSC:
+  case VPWidenFFLoadSC:
     return true;
   case VPReplicateSC:
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
@@ -184,6 +188,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {
            "underlying instruction has side-effects");
     return false;
   }
+  case VPWidenFFLoadEVLSC:
+  case VPWidenFFLoadSC:
+    return true;
   case VPInterleaveSC:
     return mayWriteToMemory();
   case VPWidenLoadEVLSC:
@@ -3224,6 +3231,55 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPWidenFFLoadEVLRecipe::execute(VPTransformState &State) {
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+
+  Value *EVL = State.get(getEVL(), VPLane(0));
+  Value *Addr = State.get(getAddr(), true);
+  Value *Mask = nullptr;
+  if (VPValue *VPMask = getMask())
+    Mask = State.get(VPMask);
+  else
+    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  CallInst *NewLI =
+      Builder.CreateIntrinsic(Intrinsic::vp_load_ff, {DataTy, Addr->getType()},
+                              {Addr, Mask, EVL}, nullptr, "vp.op.load.ff");
+  NewLI->addParamAttr(
+      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+  applyMetadata(*NewLI);
+  Value *V = cast<Instruction>(Builder.CreateExtractValue(NewLI, 0));
+  Value *VL = Builder.CreateExtractValue(NewLI, 1);
+  State.set(getVPValue(0), V);
+  State.set(getVPValue(1), VL, /*NeedsScalar=*/true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenFFLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+                                VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN ";
+  printAsOperand(O, SlotTracker);
+  O << " = fault-only-first-load ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenFFLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+                                   VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN ";
+  printAsOperand(O, SlotTracker);
+  O << ", ";
+  getVPValue(1)->printAsOperand(O, SlotTracker);
+  O << " = vp.load.ff ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
 /// Use all-true mask for reverse rather than actual mask, as it avoids a
 /// dependence w/o affecting the result.
 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index aca189280a938..11f2e6a5d4307 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2184,6 +2184,10 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
         VPValue *NewAddr = GetNewAddr(L->getAddr());
         return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
       })
+      .Case<VPWidenFFLoadRecipe>([&](VPWidenFFLoadRecipe *L) {
+        VPValue *NewMask = GetNewMask(L->getMask());
+        return new VPWidenFFLoadEVLRecipe(*L, EVL, NewMask);
+      })
       .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
         VPValue *NewMask = GetNewMask(S->getMask());
         VPValue *NewAddr = GetNewAddr(S->getAddr());
@@ -2212,7 +2216,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
 }
 
 /// Replace recipes with their EVL variants.
-static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
+static VPValue *transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   VPTypeAnalysis TypeInfo(Plan);
   VPValue *AllOneMask = Plan.getTrue();
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
@@ -2222,9 +2226,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
                 IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
                         VPWidenIntOrFpInductionRecipe>) &&
          "User of VF that we can't transform to EVL.");
-  Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
-    return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
-  });
 
   assert(all_of(Plan.getVFxUF().users(),
                 [&Plan](VPUser *U) {
@@ -2247,6 +2248,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 
   // Create a scalar phi to track the previous EVL if fixed-order recurrence is
   // contained.
+  VPInstruction *PrevEVL = nullptr;
   bool ContainsFORs =
       any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>);
   if (ContainsFORs) {
@@ -2259,8 +2261,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
         TypeInfo.inferScalarType(MaxEVL), DebugLoc());
 
     Builder.setInsertPoint(Header, Header->getFirstNonPhi());
-    VPValue *PrevEVL =
-        Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl");
+    // Postpone backedge insertion until LastEVL is known.
+    PrevEVL = Builder.createScalarPhi({MaxEVL}, DebugLoc(), "prev.evl");
 
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
              vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
@@ -2285,7 +2287,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 
   VPValue *HeaderMask = findHeaderMask(Plan);
   if (!HeaderMask)
-    return;
+    return &EVL;
 
   // Replace header masks with a mask equivalent to predicating by EVL:
   //
@@ -2313,15 +2315,21 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
       continue;
 
     [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
-    assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
-           "New recipe must define the same number of values as the "
-           "original.");
-    assert(NumDefVal <= 1 &&
-           "Only supports recipes with a single definition or without users.");
+    if (!isa<VPWidenFFLoadEVLRecipe>(EVLRecipe)) {
+      assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
+             "New recipe must define the same number of values as the "
+             "original.");
+      assert(
+          NumDefVal <= 1 &&
+          "Only supports recipes with a single definition or without users.");
+    }
     EVLRecipe->insertBefore(CurRecipe);
     if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(EVLRecipe)) {
       VPValue *CurVPV = CurRecipe->getVPSingleValue();
       CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue());
+    } else if (isa<VPWidenFFLoadEVLRecipe>(EVLRecipe)) {
+      VPValue *CurVPV = CurRecipe->getVPSingleValue();
+      CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(0));
     }
     ToErase.push_back(CurRecipe);
   }
@@ -2329,12 +2337,44 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   if (EVLMask->getNumUsers() == 0)
     ToErase.push_back(EVLMask->getDefiningRecipe());
 
+  // Find the last EVL value defined in the loop (the EVL or an FF load's new VL).
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  VPValue *LastEVL = nullptr;
+
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &CurRecipe : *VPBB) {
+      auto *VPI = dyn_cast<VPInstruction>(&CurRecipe);
+      if (VPI && (VPI->getOpcode() == VPInstruction::ExplicitVectorLength)) {
+        assert((LastEVL == nullptr) && "EVL should be set only once");
+        LastEVL = VPI;
+        continue;
+      }
+      if (!LastEVL)
+        continue;
+      if (isa<VPWidenFFLoadEVLRecipe>(CurRecipe)) {
+        LastEVL = CurRecipe.getVPValue(1);
+      }
+    }
+  }
+
+  Plan.getVF().replaceUsesWithIf(LastEVL, [](VPUser &U, unsigned Idx) {
+    return isa<VPWidenIntOrFpInductionRecipe>(U);
+  });
+  Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+    return isa<VPScalarIVStepsRecipe>(U);
+  });
+
+  if (PrevEVL)
+    PrevEVL->addOperand(LastEVL);
+
   for (VPRecipeBase *R : reverse(ToErase)) {
     SmallVector<VPValue *> PossiblyDead(R->operands());
     R->eraseFromParent();
     for (VPValue *Op : PossiblyDead)
       recursivelyDeleteDeadRecipes(Op);
   }
+  return LastEVL;
 }
 
 /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
@@ -2408,10 +2448,11 @@ void VPlanTransforms::addExplicitVectorLength(
   auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
                                      DebugLoc());
 
+  VPValue *OpVPEVL = transformRecipestoEVLRecipes(Plan, *VPEVL);
+
   auto *CanonicalIVIncrement =
       cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   Builder.setInsertPoint(CanonicalIVIncrement);
-  VPValue *OpVPEVL = VPEVL;
 
   auto *I32Ty = Type::getInt32Ty(Plan.getContext());
   OpVPEVL = Builder.createScalarZExtOrTrunc(
@@ -2429,8 +2470,6 @@ void VPlanTransforms::addExplicitVectorLength(
       DebugLoc::getCompilerGenerated(), "avl.next");
   AVLPhi->addOperand(NextAVL);
 
-  transformRecipestoEVLRecipes(Plan, *VPEVL);
-
   // Replace all uses of VPCanonicalIVPHIRecipe by
   // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
   CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 24f6d61512ef6..179e557731ce9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -40,6 +40,7 @@ class VPUser;
 class VPRecipeBase;
 class VPInterleaveRecipe;
 class VPPhiAccessors;
+class VPWidenFFLoadEVLRecipe;
 
 // This is the base class of the VPlan Def/Use graph, used for modeling the data
 // flow into, within and out of the VPlan. VPValues can stand for live-ins
@@ -51,6 +52,7 @@ class LLVM_ABI_FOR_TEST VPValue {
   friend class VPInterleaveRecipe;
   friend class VPlan;
   friend class VPExpressionRecipe;
+  friend class VPWidenFFLoadEVLRecipe;
 
   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
 
@@ -348,6 +350,8 @@ class VPDef {
     VPWidenCastSC,
     VPWidenGEPSC,
     VPWidenIntrinsicSC,
+    VPWidenFFLoadSC,
+    VPWidenFFLoadEVLSC,
     VPWidenLoadEVLSC,
     VPWidenLoadSC,
     VPWidenStoreEVLSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index e25ffe135418e..49283c2a44605 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -166,7 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
           }
           return VerifyEVLUse(*R, 2);
         })
-        .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe>(
+        .Case<VPWidenLoadEVLRecipe, VPWidenFFLoadEVLRecipe,
+              VPVectorEndPointerRecipe>(
             [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
         .Case<VPInstructionWithType>(
             [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
@@ -175,6 +176,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
               I->getOpcode() == Instruction::ICmp ||
               I->getOpcode() == Instruction::Sub)
             return VerifyEVLUse(*I, 1);
+          if (I->getOpcode() == Instruction::Sub)
+            return VerifyEVLUse(*I, 0);
           switch (I->getOpcode()) {
           case Instruction::Add:
             break;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
new file mode 100644
index 0000000000000..18f40da1ec710
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -enable-speculative-load -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+
+define ptr @find_with_liveout(ptr %first, ptr %last, ptr %value) {
+; CHECK-LABEL: define ptr @find_with_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
+; CHECK-NEXT:    br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[FIRST_ADDR_07:%.*]] = phi ptr [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[RETURN_LOOPEXIT:.*]], label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]]
+; CHECK:       [[RETURN_LOOPEXIT]]:
+; CHECK-NEXT:    [[RETVAL_0_PH:%.*]] = phi ptr [ [[FIRST_ADDR_07]], %[[FOR_BODY]] ], [ [[LAST]], %[[FOR_INC]] ]
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ]
+; CHECK-NEXT:    ret ptr [[RETVAL_0]]
+;
+entry:
+  %cmp.not6 = icmp eq ptr %first, %last
+  br i1 %cmp.not6, label %return, label %for.body.lr.ph
+
+for.body.lr.ph:
+  %0 = load i32, ptr %value, align 4
+  br label %for.body
+
+for.body:
+  %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
+  %1 = load i32, ptr %first.addr.07, align 4
+  %cmp1 = icmp eq i32 %1, %0
+  br i1 %cmp1, label %return.loopexit, label %for.inc
+
+for.inc:
+  %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1
+  %cmp.not = icmp eq ptr %incdec.ptr, %last
+  br i1 %cmp.not, label %return.loopexit, label %for.body
+
+return.loopexit:
+  %retval.0.ph = phi ptr [ %first.addr.07, %for.body ], [ %last, %for.inc ]
+  br label %return
+
+return:
+  %retval.0 = phi ptr [ %first, %entry ], [ %retval.0.ph, %return.loopexit ]
+  ret ptr %retval.0
+}
+
+define i32 @find_without_liveout(ptr %first, ptr %last, ptr %value) {
+; CHECK-LABEL: define i32 @find_without_liveout(
+; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]], ptr [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[FIRST4:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT:    [[LAST3:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT:    [[FIRST2:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT:    [[LAST1:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT:    [[CMP_NOT6:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
+; CHECK-NEXT:    br i1 [[CMP_NOT6]], label %[[RETURN:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[VALUE]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[LAST3]], -4
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], [[FIRST4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[LAST1]] to i2
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[FIRST2]] to i2
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i2 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i2 [[TMP7]] to i64
+; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i64 [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i64 [ [[TMP4]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[EVL_BASED_IV]], 4
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[VP_OP_LOAD_FF:%.*]] = call { <vscale x 4 x i32>, i32 } @llvm.vp.load.ff.nxv4i32.p0(ptr align 4 [[NEXT_GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP14]])
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, i32 } [[VP_OP_LOAD_FF]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq <vscale x 4 x i32> [[TMP16]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[TMP4]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or i1 [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_SPLIT]]:
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[RETURN_LOOPEXIT:.*]]
+; CHECK:       [[VECTOR_EARLY_EXIT]]:
+; CHECK-NEXT:    br label %[[RETURN_LOOPEXIT]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[FIRST]], %[[FOR_BODY_LR_PH]] ], [ [[FIRST]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[FIRST_ADDR_07:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[FIRST_ADDR_07]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP23]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[RETURN_LOOPEXIT]], label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[FIRST_ADDR_07]], i64 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[LAST]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[RETURN_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[RETURN_LOOPEXIT]]:
+; CHECK-NEXT:    [[RETVAL_0_PH:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ 1, %[[FOR_INC]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RETVAL_0_PH]], %[[RETURN_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+;
+entry:
+  %cmp.not6 = icmp eq ptr %first, %last
+  br i1 %cmp.not6, label %return, label %for.body.lr.ph
+
+for.body.lr.ph:
+  %0 = load i32, ptr %value, align 4
+  br label %for.body
+
+for.body:
+  %first.addr.07 = phi ptr [ %first, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
+  %1 = load i32, ptr %first.addr.07, align 4
+  %cmp1 = icmp eq i32 %1, %0
+  br i1 %cmp1, label %return.loopexit, label %for.inc
+
+for.inc:
+  %incdec.ptr = getelementptr inbounds i32, ptr %first.addr.07, i64 1
+  %cmp.not = icmp eq ptr %incdec.ptr, %last
+  br i1 %cmp.not, label %return.loopexit, label %for.body
+
+return.loopexit:
+  %retval.0.ph = phi i32 [ 0, %for.body ], [ 1, %for.inc ]
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 0, %entry ], [ %retval.0.ph, %return.loopexit ]
+  ret i32 %retval.0
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+;.


