[llvm] [VPlan] Enable vectorization of early-exit loops with unit-stride fault-only-first loads (PR #151300)

via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 20 18:43:08 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-risc-v

Author: Shih-Po Hung (arcbbb)

Changes:

Following #152422, this patch enables auto-vectorization of early-exit loops containing a single potentially faulting, unit-stride load by using the vp.load.ff intrinsic introduced in #128593.

Key changes:
- Add VPWidenFFLoadRecipe, which produces two results: the loaded vector and VL (the number of lanes actually loaded).
- Use VL to ensure correctness (see the sketch below):
   - Step the induction variable by VL (variable-length stepping).
   - Cap AVL to min(VF, remainder) so the last iteration does not read past the vector trip count.
   - Compute the early-exit condition from VL by replacing AnyOf with a vp.reduce.or over the first VL lanes, avoiding a branch on poison.
- Introduce two transforms:
   - adjustFFLoadEarlyExitForPoisonSafety: rewrites the exit condition (AnyOf → vp.reduce.or(VL)) and sets AVL = min(VF, remainder).
   - convertFFLoadEarlyExitToVLStepping: after region dissolution, converts early-exit loops to step by VL.
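
For context, here is a minimal sketch of the kind of scalar loop this targets; the function and variable names are illustrative and not taken from the patch or its tests:

```cpp
// Early-exit search loop. The scalar loop may exit at some Data[I] well before
// reaching N, so later elements need not be dereferenceable; a plain
// full-width vector load could therefore fault, which is why a
// fault-only-first load is required.
int find_first(const int *Data, int N, int Key) {
  for (int I = 0; I < N; ++I) {
    if (Data[I] == Key) // single potentially faulting, unit-stride load
      return I;         // uncountable early exit
  }
  return N;             // countable exit
}
```

Each vector iteration then issues one vp.load.ff, uses the returned VL for the exit condition, and steps the induction variable by VL, as described above.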

Limitations:
- Supports a single potentially faulting load with unit stride.
- Interleave count (IC) must be 1.

---

Patch is 34.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151300.diff


9 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+41-1) 
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+45) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+3-2) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+43) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+96) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+11) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+3) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp (+2-2) 
- (added) llvm/test/Transforms/LoopVectorize/RISCV/find.ll (+236) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1d3cffa2b61bf..e28d4c45d4ab8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -393,6 +393,12 @@ static cl::opt<bool> EnableEarlyExitVectorization(
     cl::desc(
         "Enable vectorization of early exit loops with uncountable exits."));
 
+static cl::opt<bool>
+    EnableEarlyExitWithFFLoads("enable-early-exit-with-ffload", cl::init(false),
+                               cl::Hidden,
+                               cl::desc("Enable vectorization of early-exit "
+                                        "loops with fault-only-first loads."));
+
 static cl::opt<bool> ConsiderRegPressure(
     "vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
     cl::desc("Discard VFs if their register pressure is too high."));
@@ -3507,6 +3513,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return FixedScalableVFPair::getNone();
   }
 
+  if (!Legal->getPotentiallyFaultingLoads().empty() && UserIC > 1) {
+    reportVectorizationFailure("Auto-vectorization of loops with potentially "
+                               "faulting loads is not supported when the "
+                               "interleave count is more than 1",
+                               "CantInterleaveLoopWithPotentiallyFaultingLoads",
+                               ORE, TheLoop);
+    return FixedScalableVFPair::getNone();
+  }
+
   ScalarEvolution *SE = PSE.getSE();
   ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
@@ -4076,6 +4091,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPReductionPHISC:
       case VPDef::VPInterleaveEVLSC:
       case VPDef::VPInterleaveSC:
+      case VPDef::VPWidenFFLoadSC:
       case VPDef::VPWidenLoadEVLSC:
       case VPDef::VPWidenLoadSC:
       case VPDef::VPWidenStoreEVLSC:
@@ -4550,6 +4566,10 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   if (!Legal->isSafeForAnyVectorWidth())
     return 1;
 
+  // No interleaving for potentially faulting loads.
+  if (!Legal->getPotentiallyFaultingLoads().empty())
+    return 1;
+
   // We don't attempt to perform interleaving for loops with uncountable early
   // exits because the VPInstruction::AnyOf code cannot currently handle
   // multiple parts.
@@ -7216,6 +7236,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // Regions are dissolved after optimizing for VF and UF, which completely
   // removes unneeded loop regions first.
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
+
+  VPlanTransforms::convertFFLoadEarlyExitToVLStepping(BestVPlan);
+
   // Canonicalize EVL loops after regions are dissolved.
   VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
   VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
@@ -7598,6 +7621,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
     Builder.insert(VectorPtr);
     Ptr = VectorPtr;
   }
+  if (Legal->getPotentiallyFaultingLoads().contains(I))
+    return new VPWidenFFLoadRecipe(*cast<LoadInst>(I), Ptr, &Plan.getVF(), Mask,
+                                   VPIRMetadata(*I, LVer), I->getDebugLoc());
+
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
     return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                  VPIRMetadata(*Load, LVer), I->getDebugLoc());
@@ -8632,6 +8659,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
       if (Recipe->getNumDefinedValues() == 1) {
         SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
         Old2New[SingleDef] = Recipe->getVPSingleValue();
+      } else if (isa<VPWidenFFLoadRecipe>(Recipe)) {
+        VPValue *Data = Recipe->getVPValue(0);
+        SingleDef->replaceAllUsesWith(Data);
+        Old2New[SingleDef] = Data;
       } else {
         assert(Recipe->getNumDefinedValues() == 0 &&
                "Unexpected multidef recipe");
@@ -8679,6 +8710,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // Adjust the recipes for any inloop reductions.
   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
 
+  VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(*Plan);
+
   // Apply mandatory transformation to handle FP maxnum/minnum reduction with
   // NaNs if possible, bail out otherwise.
   if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
@@ -9869,7 +9902,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  if (!LVL.getPotentiallyFaultingLoads().empty()) {
+  if (EnableEarlyExitWithFFLoads) {
+    if (LVL.getPotentiallyFaultingLoads().size() > 1) {
+      reportVectorizationFailure("Auto-vectorization of loops with more than 1 "
+                                 "potentially faulting load is not enabled",
+                                 "MoreThanOnePotentiallyFaultingLoad", ORE, L);
+      return false;
+    }
+  } else if (!LVL.getPotentiallyFaultingLoads().empty()) {
     reportVectorizationFailure("Auto-vectorization of loops with potentially "
                                "faulting load is not supported",
                                "PotentiallyFaultingLoadsNotSupported", ORE, L);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f79855f7e2c5f..6e28c95ca601a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -563,6 +563,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPInterleaveEVLSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
+    case VPRecipeBase::VPWidenFFLoadSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
     case VPRecipeBase::VPWidenStoreEVLSC:
@@ -2811,6 +2812,13 @@ class LLVM_ABI_FOR_TEST VPReductionEVLRecipe : public VPReductionRecipe {
             ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
             R.isOrdered(), DL) {}
 
+  VPReductionEVLRecipe(RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp,
+                       VPValue *VecOp, VPValue &EVL, VPValue *CondOp,
+                       bool IsOrdered, DebugLoc DL = DebugLoc::getUnknown())
+      : VPReductionRecipe(VPDef::VPReductionEVLSC, RdxKind, FMFs, nullptr,
+                          ArrayRef<VPValue *>({ChainOp, VecOp, &EVL}), CondOp,
+                          IsOrdered, DL) {}
+
   ~VPReductionEVLRecipe() override = default;
 
   VPReductionEVLRecipe *clone() override {
@@ -3159,6 +3167,7 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
   static inline bool classof(const VPRecipeBase *R) {
     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenFFLoadSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
   }
@@ -3240,6 +3249,42 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
   }
 };
 
+/// A recipe for widening loads using fault-only-first intrinsics.
+/// Produces two results: (1) the loaded data, and (2) the index of the first
+/// non-dereferenceable lane, or VF if all lanes are successfully read.
+struct VPWidenFFLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
+  VPWidenFFLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *VF, VPValue *Mask,
+                      const VPIRMetadata &Metadata, DebugLoc DL)
+      : VPWidenMemoryRecipe(VPDef::VPWidenFFLoadSC, Load, {Addr, VF},
+                            /*Consecutive*/ true, /*Reverse*/ false, Metadata,
+                            DL),
+        VPValue(this, &Load) {
+    new VPValue(nullptr, this); // Index of the first lane that faults.
+    setMask(Mask);
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenFFLoadSC);
+
+  /// Return the VF operand.
+  VPValue *getVF() const { return getOperand(1); }
+  void setVF(VPValue *V) { setOperand(1, V); }
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getVF() || Op == getAddr();
+  }
+};
+
 /// A recipe for widening load operations with vector-predication intrinsics,
 /// using the address to load from, the explicit vector length and an optional
 /// mask.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 46ab7712e2671..684dbd25597e3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -188,8 +188,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
-         "Store recipes should not define any values");
+  assert(
+      (isa<VPWidenLoadRecipe, VPWidenFFLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
+      "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8e9c3db50319f..3da8613a1d3cc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -73,6 +73,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPReductionPHISC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
+  case VPWidenFFLoadSC:
     return false;
   case VPBlendSC:
   case VPReductionEVLSC:
@@ -107,6 +108,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
+  case VPWidenFFLoadSC:
     return true;
   case VPReplicateSC:
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
@@ -3409,6 +3411,47 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPWidenFFLoadRecipe::execute(VPTransformState &State) {
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+  auto &Builder = State.Builder;
+  State.setDebugLocFrom(getDebugLoc());
+
+  Value *VL = State.get(getVF(), VPLane(0));
+  Type *I32Ty = Builder.getInt32Ty();
+  VL = Builder.CreateZExtOrTrunc(VL, I32Ty);
+  Value *Addr = State.get(getAddr(), true);
+  Value *Mask = nullptr;
+  if (VPValue *VPMask = getMask())
+    Mask = State.get(VPMask);
+  else
+    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  CallInst *NewLI =
+      Builder.CreateIntrinsic(Intrinsic::vp_load_ff, {DataTy, Addr->getType()},
+                              {Addr, Mask, VL}, nullptr, "vp.op.load.ff");
+  NewLI->addParamAttr(
+      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+  applyMetadata(*NewLI);
+  Value *V = cast<Instruction>(Builder.CreateExtractValue(NewLI, 0));
+  Value *NewVL = Builder.CreateExtractValue(NewLI, 1);
+  State.set(getVPValue(0), V);
+  State.set(getVPValue(1), NewVL, /*NeedsScalar=*/true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenFFLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+                                VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN ";
+  printAsOperand(O, SlotTracker);
+  O << ", ";
+  getVPValue(1)->printAsOperand(O, SlotTracker);
+  O << " = vp.load.ff ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
 /// Use all-true mask for reverse rather than actual mask, as it avoids a
 /// dependence w/o affecting the result.
 static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1f6b85270607e..7e78cb6ed02ac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2760,6 +2760,102 @@ void VPlanTransforms::addExplicitVectorLength(
   Plan.setUF(1);
 }
 
+void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
+  VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+  VPWidenFFLoadRecipe *LastFFLoad = nullptr;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_deep(Plan.getVectorLoopRegion())))
+    for (VPRecipeBase &R : *VPBB)
+      if (auto *Load = dyn_cast<VPWidenFFLoadRecipe>(&R)) {
+        assert(!LastFFLoad && "Only one FFLoad is supported");
+        LastFFLoad = Load;
+      }
+
+  // Skip if no FFLoad.
+  if (!LastFFLoad)
+    return;
+
+  // Ensure FFLoad does not read past the remainder in the last iteration.
+  // Set AVL to min(VF, remainder).
+  VPBuilder Builder(Header, Header->getFirstNonPhi());
+  VPValue *Remainder = Builder.createNaryOp(
+      Instruction::Sub, {&Plan.getVectorTripCount(), Plan.getCanonicalIV()});
+  VPValue *Cmp =
+      Builder.createICmp(CmpInst::ICMP_ULE, &Plan.getVF(), Remainder);
+  VPValue *AVL = Builder.createSelect(Cmp, &Plan.getVF(), Remainder);
+  LastFFLoad->setVF(AVL);
+
+  // To prevent branch-on-poison, rewrite the early-exit condition to
+  // VPReductionEVLRecipe. Expected pattern here is:
+  //   EMIT vp<%alt.exit.cond> = AnyOf
+  //   EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+  //   EMIT branch-on-cond vp<%exit.cond>
+  auto *ExitingLatch =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+  auto *LatchExitingBr = cast<VPInstruction>(ExitingLatch->getTerminator());
+
+  VPValue *VPAnyOf = nullptr;
+  VPValue *VecOp = nullptr;
+  assert(
+      match(LatchExitingBr,
+            m_BranchOnCond(m_BinaryOr(m_VPValue(VPAnyOf), m_VPValue()))) &&
+      match(VPAnyOf, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(VecOp))) &&
+      "unexpected exiting sequence in early exit loop");
+
+  VPValue *OpVPEVLI32 = LastFFLoad->getVPValue(1);
+  VPValue *Mask = LastFFLoad->getMask();
+  FastMathFlags FMF;
+  auto *I1Ty = Type::getInt1Ty(Plan.getContext());
+  VPValue *VPZero = Plan.getOrAddLiveIn(ConstantInt::get(I1Ty, 0));
+  DebugLoc DL = VPAnyOf->getDefiningRecipe()->getDebugLoc();
+  auto *NewAnyOf =
+      new VPReductionEVLRecipe(RecurKind::Or, FMF, VPZero, VecOp, *OpVPEVLI32,
+                               Mask, /*IsOrdered*/ false, DL);
+  NewAnyOf->insertBefore(VPAnyOf->getDefiningRecipe());
+  VPAnyOf->replaceAllUsesWith(NewAnyOf);
+
+  // Using FirstActiveLane in the early-exit block is safe: the exiting
+  // condition guarantees that at least one valid lane precedes any
+  // poisoned lanes.
+}
+
+void VPlanTransforms::convertFFLoadEarlyExitToVLStepping(VPlan &Plan) {
+  // Find loop header by locating VPWidenFFLoadRecipe.
+  VPWidenFFLoadRecipe *LastFFLoad = nullptr;
+
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getEntry())))
+    for (VPRecipeBase &R : *VPBB)
+      if (auto *Load = dyn_cast<VPWidenFFLoadRecipe>(&R)) {
+        assert(!LastFFLoad && "Only one FFLoad is supported");
+        LastFFLoad = Load;
+      }
+
+  // Skip if no FFLoad.
+  if (!LastFFLoad)
+    return;
+
+  VPBasicBlock *HeaderVPBB = LastFFLoad->getParent();
+  // Replace IVStep (VFxUF) with returned VL from FFLoad.
+  auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
+  VPValue *Backedge = CanonicalIV->getIncomingValue(1);
+  assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
+                                 m_Specific(&Plan.getVFxUF()))) &&
+         "Unexpected canonical iv");
+  VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+  VPValue *OpVPEVLI32 = LastFFLoad->getVPValue(1);
+  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+  Builder.setInsertPoint(CanonicalIVIncrement);
+  auto *TC = Plan.getTripCount();
+  Type *CanIVTy = TC->isLiveIn()
+                      ? TC->getLiveInIRValue()->getType()
+                      : cast<VPExpandSCEVRecipe>(TC)->getSCEV()->getType();
+  auto *I32Ty = Type::getInt32Ty(Plan.getContext());
+  VPValue *OpVPEVL = Builder.createScalarZExtOrTrunc(
+      OpVPEVLI32, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
+
+  CanonicalIVIncrement->setOperand(1, OpVPEVL);
+}
+
 void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
   // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
   // There should be only one EVL PHI in the entire plan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 69452a7e37572..bc5ce3bc43e76 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -269,6 +269,17 @@ struct VPlanTransforms {
   ///      (branch-on-cond eq AVLNext, 0)
   static void canonicalizeEVLLoops(VPlan &Plan);
 
+  /// Applies to early-exit loops that use FFLoad. FFLoad may yield fewer active
+  /// lanes than VF. To prevent branch-on-poison and over-reads past the vector
+  /// trip count, use the returned VL for both stepping and exit computation.
+  /// Implemented by:
+  ///  - adjustFFLoadEarlyExitForPoisonSafety: replace AnyOf with vp.reduce.or over
+  ///    the first VL lanes; set AVL = min(VF, remainder).
+  ///  - convertFFLoadEarlyExitToVLStepping: after region dissolution, convert
+  ///    early-exit loops to variable-length stepping.
+  static void adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan);
+  static void convertFFLoadEarlyExitToVLStepping(VPlan &Plan);
+
   /// Lower abstract recipes to concrete ones, that can be codegen'd.
   static void convertToConcreteRecipes(VPlan &Plan);
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 0678bc90ef4b5..b2bc430a09686 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -40,6 +40,7 @@ class VPUser;
 class VPRecipeBase;
 class VPInterleaveBase;
 class VPPhiAccessors;
+class VPWidenFFLoadRecipe;
 
 // This is the base class of the VPlan Def/Use graph, used for modeling the data
 // flow into, within and out of the VPlan. VPValues can stand for live-ins
@@ -51,6 +52,7 @@ class LLVM_ABI_FOR_TEST VPValue {
   friend class VPInterleaveBase;
   friend class VPlan;
   friend class VPExpressionRecipe;
+  friend class VPWidenFFLoadRecipe;
 
   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
 
@@ -351,6 +353,7 @@ class VPDef {
     VPWidenCastSC,
     VPWidenGEPSC,
     VPWidenIntrinsicSC,
+    VPWidenFFLoadSC,
     VPWidenLoadEVLSC,
     VPWidenLoadSC,
     VPWidenStoreEVLSC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 92caa0b4e51d5..70e6e0d006eb6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -166,8 +166,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
           }
           return VerifyEVLUse(*R, 2);
         })
-        .Case<VPWidenLoadEVLRecipe, VPVectorEndPointerRecipe,
-              VPInterleaveEVLRecipe>(
+        .Case<VPWidenLoadEVLRecipe, VPWidenFFLoadRecipe,
+              VPVectorEndPointerRecipe, VPInterleaveEVLRecipe>(
             [&](const VPRecipeBase *R) { return VerifyEVLUse(*R, 1); })
         .Case<VPInstructionWithType>(
             [&](const VPInstructionWithType *S) { return VerifyEVLUse(*S, 0); })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/find.ll b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
new file mode 100644
index 0000000000000..f734bd5f53c82
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/find.ll
@@ -0...
[truncated]

``````````
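
To make the intended control flow concrete, here is a rough scalar emulation of what one vector iteration does after both transforms. This is only a sketch: the helper name, the FaultLane parameter, and the scalar emulation itself are illustrative assumptions, not code from the patch or the IR it emits.

```cpp
#include <algorithm>
#include <cstddef>

// Result of one emulated vp.load.ff step.
struct FFStep {
  std::size_t VL; // lanes actually read (second result of vp.load.ff)
  bool AnyMatch;  // vp.reduce.or over the first VL lanes
};

// Emulates one vector iteration at induction value I, with vector trip count
// TC and vectorization factor VF. FaultLane simulates the first
// non-dereferenceable lane (VF if no lane faults).
FFStep emulateIteration(const int *Data, std::size_t I, std::size_t TC,
                        std::size_t VF, int Key, std::size_t FaultLane) {
  // adjustFFLoadEarlyExitForPoisonSafety: AVL = min(VF, remainder), so the
  // last iteration never reads past the vector trip count.
  std::size_t AVL = std::min(VF, TC - I);
  // vp.load.ff returns the data plus VL, the number of lanes actually read.
  std::size_t VL = std::min(AVL, FaultLane);
  bool AnyMatch = false;
  for (std::size_t L = 0; L < VL; ++L) // only the first VL lanes are defined;
    AnyMatch |= (Data[I + L] == Key);  // later lanes would be poison
  // The latch exits on AnyMatch (or on the main, countable exit) and otherwise
  // steps I += VL (convertFFLoadEarlyExitToVLStepping).
  return {VL, AnyMatch};
}
```

Because the reduction only covers the first VL lanes, the exit branch never depends on lanes the load did not produce, which is exactly what the AnyOf → vp.reduce.or rewrite is for.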



https://github.com/llvm/llvm-project/pull/151300

