[llvm] [VPlan] Enable vectorization of early-exit loops with unit-stride fault-only-first loads (PR #151300)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 01:53:46 PST 2025
================
@@ -3144,6 +3144,137 @@ void VPlanTransforms::addExplicitVectorLength(
Plan.setUF(1);
}
+void VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(VPlan &Plan) {
+ using namespace SCEVPatternMatch;
+ VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ VPWidenIntrinsicRecipe *LastFFLoad = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan.getVectorLoopRegion())))
+ for (VPRecipeBase &R : *VPBB)
+ if (match(&R, m_Intrinsic<Intrinsic::vp_load_ff>(m_VPValue(), m_VPValue(),
+ m_VPValue()))) {
+ assert(!LastFFLoad && "Only one FFLoad is supported");
+ LastFFLoad = cast<VPWidenIntrinsicRecipe>(&R);
+ }
+
+ // Skip if no FFLoad.
+ if (!LastFFLoad)
+ return;
+
+ // Ensure FFLoad does not read past the remainder in the last iteration.
+ // Set AVL to min(VF, remainder).
+ VPBuilder Builder(Header, Header->getFirstNonPhi());
+ DebugLoc DL = LastFFLoad->getDebugLoc();
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
+ VPValue *Remainder = Builder.createNaryOp(
+ Instruction::Sub, {&Plan.getVectorTripCount(), CanonicalIVPHI}, DL);
+ VPValue *Cmp =
+ Builder.createICmp(CmpInst::ICMP_ULE, &Plan.getVF(), Remainder, DL);
+ VPValue *AVL = Builder.createSelect(Cmp, &Plan.getVF(), Remainder, DL);
+ Type *CanIVTy = CanonicalIVPHI->getScalarType();
+ Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+ AVL = Builder.createScalarZExtOrTrunc(AVL, I32Ty, CanIVTy, DL);
+ LastFFLoad->setOperand(2, AVL);
+
+ // To prevent branch-on-poison, mask the early-exit condition with the
+ // active-lane-mask. The expected pattern here is:
+ // Before:
+ // EMIT vp<%alt.exit.cond> = any-of vp<%cond>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ // After:
+ // EMIT vp<%faulting.lane> = extractvalue vp<%ffload>, 1
+ // EMIT vp<%alm> = active lane mask 0, vp<%faulting.lane>
+ // EMIT vp<%and> = logical-and vp<%alm>, vp<%cond>
+ // EMIT vp<%alt.exit.cond> = any-of vp<%and>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ auto *ExitingLatch =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getExiting());
+ auto *LatchExitingBr = cast<VPInstruction>(ExitingLatch->getTerminator());
+
+ VPValue *VPAnyOf = nullptr;
+ VPValue *VecOp = nullptr;
+ [[maybe_unused]] bool IsExitingOnAnyOfOr =
+ match(LatchExitingBr,
+ m_BranchOnCond(m_BinaryOr(m_VPValue(VPAnyOf), m_VPValue()))) &&
+ match(VPAnyOf, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(VecOp)));
+ assert(IsExitingOnAnyOfOr &&
+ "unexpected exiting sequence in early exit loop");
+
+ // Create the VPValue for the index of the faulting lane.
+ VPRecipeBase *AnyOfR = VPAnyOf->getDefiningRecipe();
+ Builder.setInsertPoint(cast<VPRecipeBase>(*LastFFLoad->user_begin()));
+ VPValue *One = Plan.getConstantInt(32, 1);
+ VPValue *FaultingLane = Builder.createNaryOp(
+ VPInstruction::ExtractScalarValue, {LastFFLoad, One}, DL);
+ FaultingLane =
+ Builder.createScalarZExtOrTrunc(FaultingLane, CanIVTy, I32Ty, DL);
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 1));
+ Builder.setInsertPoint(AnyOfR);
+ DL = AnyOfR->getDebugLoc();
+ auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, 0));
+ auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+ {Zero, FaultingLane, ALMMultiplier}, DL);
+ auto *R = Builder.createNaryOp(VPInstruction::LogicalAnd, {ALM, VecOp}, DL);
+ AnyOfR->setOperand(0, R);
+
+ // Using FirstActiveLane in the early-exit block is safe, as the exiting
+ // conditions guarantee that at least one valid lane precedes any poisoned
+ // lanes.
+}
+
+void VPlanTransforms::convertFFLoadEarlyExitToVLStepping(VPlan &Plan) {
+ using namespace SCEVPatternMatch;
+ // Find loop header by locating FFLoad.
+ VPWidenIntrinsicRecipe *LastFFLoad = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry())))
+ for (VPRecipeBase &R : *VPBB)
+ if (match(&R, m_Intrinsic<Intrinsic::vp_load_ff>(m_VPValue(), m_VPValue(),
+ m_VPValue()))) {
+ assert(!LastFFLoad && "Only one FFLoad is supported");
+ LastFFLoad = cast<VPWidenIntrinsicRecipe>(&R);
+ }
+
+ // Skip if no FFLoad.
+ if (!LastFFLoad)
+ return;
+
+ VPBasicBlock *HeaderVPBB = LastFFLoad->getParent();
+ // Replace the IV step (VFxUF) with the faulting lane returned by the FFLoad.
+ auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
+ VPValue *Backedge = CanonicalIV->getIncomingValue(1);
+ assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
+ m_Specific(&Plan.getVFxUF()))) &&
+ "Unexpected canonical iv");
+ VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
+ // The expected pattern is:
+ // EMIT vp<%alm> = active lane mask 0, vp<%faulting.lane>
+ // EMIT vp<%and> = logical-and vp<%alm>, vp<%cond>
+ // EMIT vp<%alt.exit.cond> = any-of vp<%and>
+ // EMIT vp<%exit.cond> = or vp<%alt.exit.cond>, vp<%main.exit.cond>
+ // EMIT branch-on-cond vp<%exit.cond>
+ // Use the faulting-lane index to step the IV.
+ VPBasicBlock *LatchExiting =
+ HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
+ auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
+ VPValue *VPAnyOf = nullptr;
+ VPValue *FaultingLane = nullptr;
+ [[maybe_unused]] bool IsExitingOnAnyOfOr =
+ match(LatchExitingBr,
+ m_BranchOnCond(m_BinaryOr(m_VPValue(VPAnyOf), m_VPValue()))) &&
+ match(VPAnyOf,
+ m_VPInstruction<VPInstruction::AnyOf>(
+ m_VPInstruction<VPInstruction::LogicalAnd>(
+ m_VPInstruction<VPInstruction::ActiveLaneMask>(
+ m_ZeroInt(), m_VPValue(FaultingLane), m_VPValue()),
+ m_VPValue())));
+ assert(IsExitingOnAnyOfOr &&
+ "unexpected exiting sequence in early exit loop");
+
+ CanonicalIVIncrement->setOperand(1, FaultingLane);
----------------
lukel97 wrote:
I don't think this is going to play well with EVL tail folding, because we'll now have two different transforms trying to convert the plan to variable stepping: convertFFLoadEarlyExitToVLStepping and transformRecipestoEVLRecipes.
At a high level, I wonder if we even want to support vp.load.ff without EVL tail folding to begin with. From what I understand, this PR is kind of reimplementing a weaker version of EVL tail folding, since variable stepping is a hard requirement of the vp.load.ff intrinsic that we can't avoid: the intrinsic can reduce the number of lanes read for any reason.
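To illustrate with a rough scalar sketch (the names here, e.g. ff_load and findFirstMatch, are made up for illustration; this models the semantics rather than the actual VPlan output): every "vector iteration" may legally cover fewer than VF lanes, so the IV has to advance by whatever count the load actually returned, which is exactly the variable stepping that EVL tail folding already performs.
```c++
#include <algorithm>
#include <cstdint>
#include <utility>

// Stand-in for llvm.vp.load.ff: returns the loaded lanes plus the number of
// lanes actually read, which may be anywhere in [1, EVL] for any reason.
static std::pair<const int *, uint64_t> ff_load(const int *Ptr, uint64_t EVL) {
  return {Ptr, EVL}; // In this model, pretend every requested lane was readable.
}

// Scalar model of the vectorized early-exit search loop.
int64_t findFirstMatch(const int *P, uint64_t N, int Key, uint64_t VF) {
  for (uint64_t I = 0; I < N;) {
    // AVL = min(VF, remainder), as in adjustFFLoadEarlyExitForPoisonSafety.
    auto [Lanes, Count] = ff_load(P + I, std::min(VF, N - I));
    for (uint64_t L = 0; L < Count; ++L) // "AnyOf" over the valid lanes only
      if (Lanes[L] == Key)
        return static_cast<int64_t>(I + L); // early exit
    I += Count; // variable step: advance by the lanes actually read
  }
  return -1;
}
```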
I understand that this is supposed to be an incremental PR, but I think a better roadmap might be to start by supporting early-exit loops with tail folding. I think this means we need to address the "variable header mask" TODO here:
```c++
bool LoopVectorizationLegality::canFoldTailByMasking() const {
  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Cannot fold tail by masking. Requires a single latch exit\n");
    return false;
  }
```
I think we can do this if we replace the notion of a header mask with the notion of a per-block header mask.
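For reference, the kind of loop shape in question (an illustrative example, not one of the PR's tests): the early exit in the body means the exiting block isn't the latch, so the check quoted above rejects tail folding today; with a per-block header mask, recipes after the early exit would presumably be masked to the lanes before the first exiting lane.
```c++
// Illustrative only: a search loop whose early exit makes
// TheLoop->getExitingBlock() != TheLoop->getLoopLatch(), so the check
// quoted above currently bails out of tail folding for it.
int firstNegative(const int *A, int N) {
  for (int I = 0; I < N; ++I) // latch exit: I < N
    if (A[I] < 0)             // additional early exit inside the body
      return I;
  return -1;
}
```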
https://github.com/llvm/llvm-project/pull/151300