[llvm] [VPlan] Move tail folding out of VPlanPredicator. NFC (PR #176143)

Mon Mar 2 07:48:41 PST 2026

https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/176143

>From 8e7238ebcd31161d2272a669dd394dfb6c42c3d5 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 15 Jan 2026 20:10:57 +0800
Subject: [PATCH 01/24] [VPlan] Move tail folding out of VPlanPredicator. NFC

Currently the logic for introducing a header mask and predicating the vector loop region is done inside introduceMasksAndLinearize.

This splits the tail folding part out into its own VPlan transform so that VPlanPredicator.cpp no longer needs to know about tail folding. According to a comment in VPlanTransforms.h, handling tail folding there via the FoldTail argument was only meant to be a temporary measure.

To perform tail folding independently, this splits the vector loop region so that its "body" is separated from the phis in the header and from the IV increment + branch in the latch:

Before:

    +-------------------------------------------+
    |%iv = ...                                  |
    |...                                        |
    |%iv.next = add %iv, vfxuf                  |
    |branch-on-count %iv.next, vector-trip-count|
    +-------------------------------------------+

After:

    +-------------------------------------------+
    |%iv = ...                                  |
    |%wide.iv = widen-canonical-iv ...          |
    |%header-mask = icmp ule %wide.iv, BTC      |
    |branch-on-cond %header-mask                |---+
    +-------------------------------------------+   |
                         |                          |
                         v                          |
    +-------------------------------------------+   |
    |...                                        |   |
    +-------------------------------------------+   |
                         |                          |
                         v                          |
    +-------------------------------------------+   |
    |%iv.next = add %iv, vfxuf                  |<--+
    |branch-on-count %iv.next, vector-trip-count|
    +-------------------------------------------+

The motivation for this is to align tail folding predication with early-exit predication in #172454. This will also allow us to directly emit an EVL-based header mask, instead of having to match and transform the existing header mask in addExplicitVectorLength.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  9 ++-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  5 ++
 .../Vectorize/VPlanConstruction.cpp           | 55 +++++++++++++++
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 67 +------------------
 .../Transforms/Vectorize/VPlanTransforms.h    | 42 ++++++++++--
 5 files changed, 105 insertions(+), 73 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3f1e12e5d1cd0..1dcd33828445a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8463,11 +8463,13 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     InterleaveGroups.insert(IG);
   }
 
+  if (CM.foldTailByMasking())
+    VPlanTransforms::foldTailByMasking(*Plan);
+
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
   // ---------------------------------------------------------------------------
-  auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
-      *Plan, CM.foldTailByMasking());
+  auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(*Plan);
 
   // ---------------------------------------------------------------------------
   // Construct wide recipes and apply predication for original scalar
@@ -8726,7 +8728,8 @@ void LoopVectorizationPlanner::addReductionResultComputation(
     auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
     if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
         (!RR || !RR->isPartialReduction())) {
-      VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
+      VPValue *Cond = RecipeBuilder.getBlockInMask(
+          cast<VPBasicBlock>(PhiR->getParent()->getSuccessors().front()));
       std::optional<FastMathFlags> FMFs =
           PhiTy->isFloatingPointTy()
               ? std::make_optional(RdxDesc.getFastMathFlags())
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a6a46e36b397d..d8817e1d481a7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -566,6 +566,11 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
   auto *SplitBlock = getPlan()->createVPBasicBlock(getName() + ".split");
   VPBlockUtils::insertBlockAfter(SplitBlock, this);
 
+  // If this is the exiting block, make the split the new exiting block.
+  auto *ParentRegion = getParent();
+  if (ParentRegion && ParentRegion->getExiting() == this)
+    ParentRegion->setExiting(SplitBlock);
+
   // Finally, move the recipes starting at SplitAt to new block.
   for (VPRecipeBase &ToMove :
        make_early_inc_range(make_range(SplitAt, this->end())))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 7ee133545eeb9..97ad127658320 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -958,6 +958,61 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
   TopRegion->getEntryBasicBlock()->setName("vector.body");
 }
 
+void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
+  assert(Plan.getExitBlocks().size() == 1 &&
+         "only a single-exit block is supported currently");
+  assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
+             Plan.getMiddleBlock() &&
+         "the exit block must have middle block as single predecessor");
+
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+
+  Header->splitAt(Header->getFirstNonPhi());
+
+  // Create the header mask, insert it in the header and branch on it.
+  auto *IV =
+      new VPWidenCanonicalIVRecipe(Header->getParent()->getCanonicalIV());
+  VPBuilder Builder(Header, Header->getFirstNonPhi());
+  Builder.insert(IV);
+  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+  VPValue *HeaderMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+  Builder.createNaryOp(VPInstruction::BranchOnCond, HeaderMask);
+
+  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
+  VPValue *IVInc;
+  [[maybe_unused]] bool TermBranchOnCount =
+      match(Latch->getTerminator(),
+            m_BranchOnCount(m_VPValue(IVInc),
+                            m_Specific(&Plan.getVectorTripCount())));
+  assert(TermBranchOnCount &&
+         match(IVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
+                            m_Specific(&Plan.getVFxUF()))) &&
+         "Unexpected terminator");
+
+  // Split the latch at the IV update, and branch to it from the header mask.
+  VPBasicBlock *LatchSplit =
+      Latch->splitAt(IVInc->getDefiningRecipe()->getIterator());
+  VPBlockUtils::connectBlocks(Header, LatchSplit);
+
+  // Any extract of the last element must be updated to extract from the last
+  // active lane of the header mask instead (i.e., the lane corresponding to the
+  // last active iteration).
+  Builder.setInsertPoint(Plan.getMiddleBlock()->getTerminator());
+  for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
+    VPValue *Op;
+    if (!match(&R, m_ExtractLastLane(m_ExtractLastPart(m_VPValue(Op)))))
+      continue;
+
+    // Compute the index of the last active lane.
+    VPValue *LastActiveLane =
+        Builder.createNaryOp(VPInstruction::LastActiveLane, HeaderMask);
+    auto *Ext =
+        Builder.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op});
+    R.getVPSingleValue()->replaceAllUsesWith(Ext);
+  }
+}
+
 // Likelyhood of bypassing the vectorized loop due to a runtime check block,
 // including memory overlap checks block and wrapping/unit-stride checks block.
 static constexpr uint32_t CheckBypassWeights[] = {1, 127};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f7e7fc29bc203..96bee083875af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,13 +14,11 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
-#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
 #include "llvm/ADT/PostOrderIterator.h"
 
 using namespace llvm;
-using namespace VPlanPatternMatch;
 
 namespace {
 class VPPredicator {
@@ -73,9 +71,6 @@ class VPPredicator {
     return EdgeMaskCache.lookup({Src, Dst});
   }
 
-  /// Compute and return the mask for the vector loop header block.
-  void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
-
   /// Compute and return the predicate of \p VPBB, assuming that the header
   /// block of the loop is set to True, or to the loop mask when tail folding.
   VPValue *createBlockInMask(VPBasicBlock *VPBB);
@@ -156,28 +151,6 @@ VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
   return BlockMask;
 }
 
-void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
-  if (!FoldTail) {
-    setBlockInMask(HeaderVPBB, nullptr);
-    return;
-  }
-
-  // Introduce the early-exit compare IV <= BTC to form header block mask.
-  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
-  // constructing the desired canonical IV in the header block as its first
-  // non-phi instructions.
-
-  auto &Plan = *HeaderVPBB->getPlan();
-  auto *IV =
-      new VPWidenCanonicalIVRecipe(HeaderVPBB->getParent()->getCanonicalIV());
-  Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
-  Builder.insert(IV);
-
-  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
-  VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
-  setBlockInMask(HeaderVPBB, BlockMask);
-}
-
 void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
   VPBasicBlock *Src = SI->getParent();
 
@@ -232,7 +205,8 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
 void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
   SmallVector<VPPhi *> Phis;
   for (VPRecipeBase &R : VPBB->phis())
-    Phis.push_back(cast<VPPhi>(&R));
+    if (auto *PhiR = dyn_cast<VPPhi>(&R))
+      Phis.push_back(PhiR);
   for (VPPhi *PhiR : Phis) {
     // The non-header Phi is converted into a Blend recipe below,
     // so we don't have to worry about the insertion order and we can just use
@@ -261,7 +235,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
 }
 
 DenseMap<VPBasicBlock *, VPValue *>
-VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
+VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
@@ -272,14 +246,6 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
     auto *VPBB = cast<VPBasicBlock>(VPB);
-    // Introduce the mask for VPBB, which may introduce needed edge masks, and
-    // convert all phi recipes of VPBB to blend recipes unless VPBB is the
-    // header.
-    if (VPBB == Header) {
-      Predicator.createHeaderMask(Header, FoldTail);
-      continue;
-    }
-
     Predicator.createBlockInMask(VPBB);
     Predicator.convertPhisToBlends(VPBB);
   }
@@ -300,32 +266,5 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
 
     PrevVPBB = VPBB;
   }
-
-  // If we folded the tail and introduced a header mask, any extract of the
-  // last element must be updated to extract from the last active lane of the
-  // header mask instead (i.e., the lane corresponding to the last active
-  // iteration).
-  if (FoldTail) {
-    assert(Plan.getExitBlocks().size() == 1 &&
-           "only a single-exit block is supported currently");
-    assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
-               Plan.getMiddleBlock() &&
-           "the exit block must have middle block as single predecessor");
-
-    VPBuilder B(Plan.getMiddleBlock()->getTerminator());
-    for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
-      VPValue *Op;
-      if (!match(&R, m_ExtractLastLane(m_ExtractLastPart(m_VPValue(Op)))))
-        continue;
-
-      // Compute the index of the last active lane.
-      VPValue *HeaderMask = Predicator.getBlockInMask(Header);
-      VPValue *LastActiveLane =
-          B.createNaryOp(VPInstruction::LastActiveLane, HeaderMask);
-      auto *Ext =
-          B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op});
-      R.getVPSingleValue()->replaceAllUsesWith(Ext);
-    }
-  }
   return Predicator.getBlockMaskCache();
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index e0d09a099647a..e00c94558bf7a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -413,14 +413,44 @@ struct VPlanTransforms {
   static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                      TypeSize VectorRegWidth);
 
+  /// Adapts the vector loop region for tail folding by introducing a header
+  /// mask and predicating the region:
+  ///
+  /// Vector loop region before:
+  /// +-------------------------------------------+
+  /// |%iv = ...                                  |
+  /// |...                                        |
+  /// |%iv.next = add %iv, vfxuf                  |
+  /// |branch-on-count %iv.next, vector-trip-count|
+  /// +-------------------------------------------+
+  ///
+  /// Vector loop region after:
+  /// +-------------------------------------------+
+  /// |%iv = ...                                  |
+  /// |%wide.iv = widen-canonical-iv ...          |
+  /// |%header-mask = icmp ult %wide.iv, BTC      |
+  /// |branch-on-cond %header-mask                |---+
+  /// +-------------------------------------------+   |
+  ///                      |                          |
+  ///                      v                          |
+  /// +-------------------------------------------+   |
+  /// |                   ...                     |   |
+  /// +-------------------------------------------+   |
+  ///                      |                          |
+  ///                      v                          |
+  /// +-------------------------------------------+   |
+  /// |%iv.next = add %iv, vfxuf                  |<--+
+  /// |branch-on-count %iv.next, vector-trip-count|
+  /// +-------------------------------------------+
+  ///
+  /// Any VPInstruction::ExtractLastLanes are also updated to extract from the
+  /// last active lane of the header mask.
+  static void foldTailByMasking(VPlan &Plan);
+
   /// Predicate and linearize the control-flow in the only loop region of
-  /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
-  /// header, otherwise use all-true for the header mask. Masks for blocks are
-  /// added to a block-to-mask map which is returned in order to be used later
-  /// for wide recipe construction. This argument is temporary and will be
-  /// removed in the future.
+  /// \p Plan.
   static DenseMap<VPBasicBlock *, VPValue *>
-  introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
+  introduceMasksAndLinearize(VPlan &Plan);
 
   /// Add branch weight metadata, if the \p Plan's middle block is terminated by
   /// a BranchOnCond recipe.

>From ed18c28e87494515bf398919b68fbb0a0527123d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 22 Jan 2026 01:13:20 +0800
Subject: [PATCH 02/24] Don't use branch-on-cond, explicitly pass mask in
 densemap to VPPredicator

This preserves SSA
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 ++-
 .../Vectorize/VPlanConstruction.cpp           | 23 ++------
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 17 ++++--
 .../Transforms/Vectorize/VPlanTransforms.h    | 53 ++++++-------------
 4 files changed, 40 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1dcd33828445a..26462483b2059 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8463,13 +8463,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     InterleaveGroups.insert(IG);
   }
 
+  DenseMap<VPBasicBlock *, VPValue *> SuccessorMasks;
   if (CM.foldTailByMasking())
-    VPlanTransforms::foldTailByMasking(*Plan);
+    VPlanTransforms::foldTailByMasking(*Plan, SuccessorMasks);
 
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
   // ---------------------------------------------------------------------------
-  auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(*Plan);
+  auto BlockMaskCache =
+      VPlanTransforms::introduceMasksAndLinearize(*Plan, SuccessorMasks);
 
   // ---------------------------------------------------------------------------
   // Construct wide recipes and apply predication for original scalar
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 97ad127658320..8daef2ba03825 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -958,7 +958,8 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
   TopRegion->getEntryBasicBlock()->setName("vector.body");
 }
 
-void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
+void VPlanTransforms::foldTailByMasking(
+    VPlan &Plan, DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks) {
   assert(Plan.getExitBlocks().size() == 1 &&
          "only a single-exit block is supported currently");
   assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
@@ -970,30 +971,16 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
 
   Header->splitAt(Header->getFirstNonPhi());
 
-  // Create the header mask, insert it in the header and branch on it.
+  // Create the header mask and insert it in the header.
   auto *IV =
       new VPWidenCanonicalIVRecipe(Header->getParent()->getCanonicalIV());
   VPBuilder Builder(Header, Header->getFirstNonPhi());
   Builder.insert(IV);
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
   VPValue *HeaderMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
-  Builder.createNaryOp(VPInstruction::BranchOnCond, HeaderMask);
-
-  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
-  VPValue *IVInc;
-  [[maybe_unused]] bool TermBranchOnCount =
-      match(Latch->getTerminator(),
-            m_BranchOnCount(m_VPValue(IVInc),
-                            m_Specific(&Plan.getVectorTripCount())));
-  assert(TermBranchOnCount &&
-         match(IVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
-                            m_Specific(&Plan.getVFxUF()))) &&
-         "Unexpected terminator");
 
-  // Split the latch at the IV update, and branch to it from the header mask.
-  VPBasicBlock *LatchSplit =
-      Latch->splitAt(IVInc->getDefiningRecipe()->getIterator());
-  VPBlockUtils::connectBlocks(Header, LatchSplit);
+  // Predicate everything after the header mask.
+  SuccessorMasks[Header] = HeaderMask;
 
   // Any extract of the last element must be updated to extract from the last
   // active lane of the header mask instead (i.e., the lane corresponding to the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 96bee083875af..251a3afa4634d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -35,6 +35,10 @@ class VPPredicator {
 
   BlockMaskCacheTy BlockMaskCache;
 
+  /// A map of blocks to masks that should be applied to each of its successors'
+  /// edges.
+  const BlockMaskCacheTy SuccessorMasks;
+
   /// Create an edge mask for every destination of cases and/or default.
   void createSwitchEdgeMasks(VPInstruction *SI);
 
@@ -61,6 +65,9 @@ class VPPredicator {
   }
 
 public:
+  VPPredicator(const BlockMaskCacheTy &SuccessorMasks)
+      : SuccessorMasks(SuccessorMasks) {}
+
   /// Returns the *entry* mask for \p VPBB.
   VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
     return BlockMaskCache.lookup(VPBB);
@@ -92,6 +99,10 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
 
   VPValue *SrcMask = getBlockInMask(Src);
 
+  if (VPValue *SuccessorMask = SuccessorMasks.lookup(Src))
+    SrcMask = SrcMask ? Builder.createLogicalAnd(SrcMask, SuccessorMask)
+                      : SuccessorMask;
+
   // If there's a single successor, there's no terminator recipe.
   if (Src->getNumSuccessors() == 1)
     return setEdgeMask(Src, Dst, SrcMask);
@@ -234,15 +245,15 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
   }
 }
 
-DenseMap<VPBasicBlock *, VPValue *>
-VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
+DenseMap<VPBasicBlock *, VPValue *> VPlanTransforms::introduceMasksAndLinearize(
+    VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Header);
-  VPPredicator Predicator;
+  VPPredicator Predicator(SuccessorMasks);
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
     auto *VPBB = cast<VPBasicBlock>(VPB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index e00c94558bf7a..cf0147e5a223c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -413,44 +413,25 @@ struct VPlanTransforms {
   static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                      TypeSize VectorRegWidth);
 
-  /// Adapts the vector loop region for tail folding by introducing a header
-  /// mask and predicating the region:
+  /// Adapts the vector loop region for tail folding by inserting a header mask
+  /// after the first non-phi in the header:
   ///
-  /// Vector loop region before:
-  /// +-------------------------------------------+
-  /// |%iv = ...                                  |
-  /// |...                                        |
-  /// |%iv.next = add %iv, vfxuf                  |
-  /// |branch-on-count %iv.next, vector-trip-count|
-  /// +-------------------------------------------+
+  /// %wide.iv = widen-canonical-iv ...
+  /// %header-mask = icmp ult %wide.iv, BTC
   ///
-  /// Vector loop region after:
-  /// +-------------------------------------------+
-  /// |%iv = ...                                  |
-  /// |%wide.iv = widen-canonical-iv ...          |
-  /// |%header-mask = icmp ult %wide.iv, BTC      |
-  /// |branch-on-cond %header-mask                |---+
-  /// +-------------------------------------------+   |
-  ///                      |                          |
-  ///                      v                          |
-  /// +-------------------------------------------+   |
-  /// |                   ...                     |   |
-  /// +-------------------------------------------+   |
-  ///                      |                          |
-  ///                      v                          |
-  /// +-------------------------------------------+   |
-  /// |%iv.next = add %iv, vfxuf                  |<--+
-  /// |branch-on-count %iv.next, vector-trip-count|
-  /// +-------------------------------------------+
-  ///
-  /// Any VPInstruction::ExtractLastLanes are also updated to extract from the
-  /// last active lane of the header mask.
-  static void foldTailByMasking(VPlan &Plan);
-
-  /// Predicate and linearize the control-flow in the only loop region of
-  /// \p Plan.
-  static DenseMap<VPBasicBlock *, VPValue *>
-  introduceMasksAndLinearize(VPlan &Plan);
+  /// The header is then split at the header mask, and everything afterwards is
+  /// predicated by adding the header mask to \p SuccessorMasks. Any
+  /// VPInstruction::ExtractLastLanes are also updated to extract from the last
+  /// active lane of the header mask.
+  static void
+  foldTailByMasking(VPlan &Plan,
+                    DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks);
+
+  /// Predicate and linearize the control-flow in the only loop region of \p
+  /// Plan. Masks for blocks are added to a block-to-mask map which is returned
+  /// in order to be used later for wide recipe construction.
+  static DenseMap<VPBasicBlock *, VPValue *> introduceMasksAndLinearize(
+      VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks);
 
   /// Add branch weight metadata, if the \p Plan's middle block is terminated by
   /// a BranchOnCond recipe.

>From 81332b3fa28e95819fe9d67310a45409b15a833e Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 22 Jan 2026 18:42:48 +0800
Subject: [PATCH 03/24] Don't pass state, use
 VPInstruction::PredicateSuccessors

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 ++--
 llvm/lib/Transforms/Vectorize/VPlan.h         |  4 +++
 .../Vectorize/VPlanConstruction.cpp           |  6 ++--
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 28 +++++++++----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  1 +
 .../Transforms/Vectorize/VPlanTransforms.h    | 12 ++++----
 6 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 26462483b2059..1dcd33828445a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8463,15 +8463,13 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     InterleaveGroups.insert(IG);
   }
 
-  DenseMap<VPBasicBlock *, VPValue *> SuccessorMasks;
   if (CM.foldTailByMasking())
-    VPlanTransforms::foldTailByMasking(*Plan, SuccessorMasks);
+    VPlanTransforms::foldTailByMasking(*Plan);
 
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
   // ---------------------------------------------------------------------------
-  auto BlockMaskCache =
-      VPlanTransforms::introduceMasksAndLinearize(*Plan, SuccessorMasks);
+  auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(*Plan);
 
   // ---------------------------------------------------------------------------
   // Construct wide recipes and apply predication for original scalar
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1d250322ccf63..33884726e0d80 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1193,6 +1193,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     /// active (non-zero) lane in the mask (second operand), or if no lanes
     /// were active in the mask, returns the default value (third operand).
     ExtractLastActive,
+    /// A transient terminator that indicates any successors of the block should
+    /// be predicated with the mask from the first operand. Should be removed
+    /// after introduceMasksAndLinearize.
+    PredicateSuccessors,
 
     /// Returns the value for vscale.
     VScale,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 8daef2ba03825..89507b68d4b0c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -958,8 +958,7 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
   TopRegion->getEntryBasicBlock()->setName("vector.body");
 }
 
-void VPlanTransforms::foldTailByMasking(
-    VPlan &Plan, DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks) {
+void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   assert(Plan.getExitBlocks().size() == 1 &&
          "only a single-exit block is supported currently");
   assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
@@ -980,7 +979,8 @@ void VPlanTransforms::foldTailByMasking(
   VPValue *HeaderMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
 
   // Predicate everything after the header mask.
-  SuccessorMasks[Header] = HeaderMask;
+  Builder.setInsertPoint(Header, Header->end());
+  Builder.createNaryOp(VPInstruction::PredicateSuccessors, HeaderMask);
 
   // Any extract of the last element must be updated to extract from the last
   // active lane of the header mask instead (i.e., the lane corresponding to the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 251a3afa4634d..3c51bdea5cb25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,11 +14,13 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
 #include "llvm/ADT/PostOrderIterator.h"
 
 using namespace llvm;
+using namespace VPlanPatternMatch;
 
 namespace {
 class VPPredicator {
@@ -35,10 +37,6 @@ class VPPredicator {
 
   BlockMaskCacheTy BlockMaskCache;
 
-  /// A map of blocks to masks that should be applied to each of its successors'
-  /// edges.
-  const BlockMaskCacheTy SuccessorMasks;
-
   /// Create an edge mask for every destination of cases and/or default.
   void createSwitchEdgeMasks(VPInstruction *SI);
 
@@ -65,9 +63,6 @@ class VPPredicator {
   }
 
 public:
-  VPPredicator(const BlockMaskCacheTy &SuccessorMasks)
-      : SuccessorMasks(SuccessorMasks) {}
-
   /// Returns the *entry* mask for \p VPBB.
   VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
     return BlockMaskCache.lookup(VPBB);
@@ -78,8 +73,7 @@ class VPPredicator {
     return EdgeMaskCache.lookup({Src, Dst});
   }
 
-  /// Compute and return the predicate of \p VPBB, assuming that the header
-  /// block of the loop is set to True, or to the loop mask when tail folding.
+  /// Compute and return the predicate of \p VPBB.
   VPValue *createBlockInMask(VPBasicBlock *VPBB);
 
   /// Convert phi recipes in \p VPBB to VPBlendRecipes.
@@ -99,9 +93,13 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
 
   VPValue *SrcMask = getBlockInMask(Src);
 
-  if (VPValue *SuccessorMask = SuccessorMasks.lookup(Src))
-    SrcMask = SrcMask ? Builder.createLogicalAnd(SrcMask, SuccessorMask)
-                      : SuccessorMask;
+  VPValue *SuccPred;
+  if (!Src->empty() &&
+      match(&Src->back(), m_VPInstruction<VPInstruction::PredicateSuccessors>(
+                              m_VPValue(SuccPred)))) {
+    Src->back().eraseFromParent();
+    SrcMask = SrcMask ? Builder.createLogicalAnd(SrcMask, SuccPred) : SuccPred;
+  }
 
   // If there's a single successor, there's no terminator recipe.
   if (Src->getNumSuccessors() == 1)
@@ -245,15 +243,15 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
   }
 }
 
-DenseMap<VPBasicBlock *, VPValue *> VPlanTransforms::introduceMasksAndLinearize(
-    VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks) {
+DenseMap<VPBasicBlock *, VPValue *>
+VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Header);
-  VPPredicator Predicator(SuccessorMasks);
+  VPPredicator Predicator;
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
     auto *VPBB = cast<VPBasicBlock>(VPB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0c6100c04f785..4a43255cd7244 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -454,6 +454,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case VPInstruction::ExtractLastPart:
   case VPInstruction::ExtractPenultimateElement:
   case VPInstruction::Not:
+  case VPInstruction::PredicateSuccessors:
   case VPInstruction::ResumeForEpilogue:
   case VPInstruction::Reverse:
   case VPInstruction::Unpack:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index cf0147e5a223c..740420f9e3ad9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -419,19 +419,17 @@ struct VPlanTransforms {
   /// %wide.iv = widen-canonical-iv ...
   /// %header-mask = icmp ult %wide.iv, BTC
   ///
-  /// The header is then split at the header mask, and everything afterwards is
-  /// predicated by adding the header mask to \p SuccessorMasks. Any
+  /// The header is then split at the header mask and successors are predicated
+  /// with VPInstruction::PredicateSuccessors. Any
   /// VPInstruction::ExtractLastLanes are also updated to extract from the last
   /// active lane of the header mask.
-  static void
-  foldTailByMasking(VPlan &Plan,
-                    DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks);
+  static void foldTailByMasking(VPlan &Plan);
 
   /// Predicate and linearize the control-flow in the only loop region of \p
   /// Plan. Masks for blocks are added to a block-to-mask map which is returned
   /// in order to be used later for wide recipe construction.
-  static DenseMap<VPBasicBlock *, VPValue *> introduceMasksAndLinearize(
-      VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks);
+  static DenseMap<VPBasicBlock *, VPValue *>
+  introduceMasksAndLinearize(VPlan &Plan);
 
   /// Add branch weight metadata, if the \p Plan's middle block is terminated by
   /// a BranchOnCond recipe.

>From 5475f482929c1b2a325b9295b85878785d5ba281 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 23 Jan 2026 19:43:20 +0800
Subject: [PATCH 04/24] Revert "Don't pass state, use
 VPInstruction::PredicateSuccessors"

This reverts commit 81332b3fa28e95819fe9d67310a45409b15a833e.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 ++--
 llvm/lib/Transforms/Vectorize/VPlan.h         |  4 ---
 .../Vectorize/VPlanConstruction.cpp           |  6 ++--
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 28 ++++++++++---------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  1 -
 .../Transforms/Vectorize/VPlanTransforms.h    | 12 ++++----
 6 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1dcd33828445a..26462483b2059 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8463,13 +8463,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     InterleaveGroups.insert(IG);
   }
 
+  DenseMap<VPBasicBlock *, VPValue *> SuccessorMasks;
   if (CM.foldTailByMasking())
-    VPlanTransforms::foldTailByMasking(*Plan);
+    VPlanTransforms::foldTailByMasking(*Plan, SuccessorMasks);
 
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
   // ---------------------------------------------------------------------------
-  auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(*Plan);
+  auto BlockMaskCache =
+      VPlanTransforms::introduceMasksAndLinearize(*Plan, SuccessorMasks);
 
   // ---------------------------------------------------------------------------
   // Construct wide recipes and apply predication for original scalar
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 33884726e0d80..1d250322ccf63 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1193,10 +1193,6 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     /// active (non-zero) lane in the mask (second operand), or if no lanes
     /// were active in the mask, returns the default value (third operand).
     ExtractLastActive,
-    /// A transient terminator that indicates any successors of the block should
-    /// be predicated with the mask from the first operand. Should be removed
-    /// after introduceMasksAndLinearize.
-    PredicateSuccessors,
 
     /// Returns the value for vscale.
     VScale,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 89507b68d4b0c..8daef2ba03825 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -958,7 +958,8 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
   TopRegion->getEntryBasicBlock()->setName("vector.body");
 }
 
-void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
+void VPlanTransforms::foldTailByMasking(
+    VPlan &Plan, DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks) {
   assert(Plan.getExitBlocks().size() == 1 &&
          "only a single-exit block is supported currently");
   assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
@@ -979,8 +980,7 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   VPValue *HeaderMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
 
   // Predicate everything after the header mask.
-  Builder.setInsertPoint(Header, Header->end());
-  Builder.createNaryOp(VPInstruction::PredicateSuccessors, HeaderMask);
+  SuccessorMasks[Header] = HeaderMask;
 
   // Any extract of the last element must be updated to extract from the last
   // active lane of the header mask instead (i.e., the lane corresponding to the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 3c51bdea5cb25..251a3afa4634d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,13 +14,11 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
-#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
 #include "llvm/ADT/PostOrderIterator.h"
 
 using namespace llvm;
-using namespace VPlanPatternMatch;
 
 namespace {
 class VPPredicator {
@@ -37,6 +35,10 @@ class VPPredicator {
 
   BlockMaskCacheTy BlockMaskCache;
 
+  /// A map of blocks to masks that should be applied to each of its successors'
+  /// edges.
+  const BlockMaskCacheTy SuccessorMasks;
+
   /// Create an edge mask for every destination of cases and/or default.
   void createSwitchEdgeMasks(VPInstruction *SI);
 
@@ -63,6 +65,9 @@ class VPPredicator {
   }
 
 public:
+  VPPredicator(const BlockMaskCacheTy &SuccessorMasks)
+      : SuccessorMasks(SuccessorMasks) {}
+
   /// Returns the *entry* mask for \p VPBB.
   VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
     return BlockMaskCache.lookup(VPBB);
@@ -73,7 +78,8 @@ class VPPredicator {
     return EdgeMaskCache.lookup({Src, Dst});
   }
 
-  /// Compute and return the predicate of \p VPBB.
+  /// Compute and return the predicate of \p VPBB, assuming that the header
+  /// block of the loop is set to True, or to the loop mask when tail folding.
   VPValue *createBlockInMask(VPBasicBlock *VPBB);
 
   /// Convert phi recipes in \p VPBB to VPBlendRecipes.
@@ -93,13 +99,9 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
 
   VPValue *SrcMask = getBlockInMask(Src);
 
-  VPValue *SuccPred;
-  if (!Src->empty() &&
-      match(&Src->back(), m_VPInstruction<VPInstruction::PredicateSuccessors>(
-                              m_VPValue(SuccPred)))) {
-    Src->back().eraseFromParent();
-    SrcMask = SrcMask ? Builder.createLogicalAnd(SrcMask, SuccPred) : SuccPred;
-  }
+  if (VPValue *SuccessorMask = SuccessorMasks.lookup(Src))
+    SrcMask = SrcMask ? Builder.createLogicalAnd(SrcMask, SuccessorMask)
+                      : SuccessorMask;
 
   // If there's a single successor, there's no terminator recipe.
   if (Src->getNumSuccessors() == 1)
@@ -243,15 +245,15 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
   }
 }
 
-DenseMap<VPBasicBlock *, VPValue *>
-VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
+DenseMap<VPBasicBlock *, VPValue *> VPlanTransforms::introduceMasksAndLinearize(
+    VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Header);
-  VPPredicator Predicator;
+  VPPredicator Predicator(SuccessorMasks);
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
     auto *VPBB = cast<VPBasicBlock>(VPB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4a43255cd7244..0c6100c04f785 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -454,7 +454,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case VPInstruction::ExtractLastPart:
   case VPInstruction::ExtractPenultimateElement:
   case VPInstruction::Not:
-  case VPInstruction::PredicateSuccessors:
   case VPInstruction::ResumeForEpilogue:
   case VPInstruction::Reverse:
   case VPInstruction::Unpack:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 740420f9e3ad9..cf0147e5a223c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -419,17 +419,19 @@ struct VPlanTransforms {
   /// %wide.iv = widen-canonical-iv ...
   /// %header-mask = icmp ult %wide.iv, BTC
   ///
-  /// The header is then split at the header mask and successors are predicated
-  /// with VPInstruction::PredicateSuccessors. Any
+  /// The header is then split at the header mask, and everything afterwards is
+  /// predicated by adding the header mask to \p SuccessorMasks. Any
   /// VPInstruction::ExtractLastLanes are also updated to extract from the last
   /// active lane of the header mask.
-  static void foldTailByMasking(VPlan &Plan);
+  static void
+  foldTailByMasking(VPlan &Plan,
+                    DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks);
 
   /// Predicate and linearize the control-flow in the only loop region of \p
   /// Plan. Masks for blocks are added to a block-to-mask map which is returned
   /// in order to be used later for wide recipe construction.
-  static DenseMap<VPBasicBlock *, VPValue *>
-  introduceMasksAndLinearize(VPlan &Plan);
+  static DenseMap<VPBasicBlock *, VPValue *> introduceMasksAndLinearize(
+      VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks);
 
   /// Add branch weight metadata, if the \p Plan's middle block is terminated by
   /// a BranchOnCond recipe.

>From d7bb1118b3536973310c829a15049d8866c838c3 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 23 Jan 2026 19:43:23 +0800
Subject: [PATCH 05/24] Revert "Don't use branch-on-cond, explicitly pass mask
 in densemap to VPPredicator"

This reverts commit ed18c28e87494515bf398919b68fbb0a0527123d.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 +--
 .../Vectorize/VPlanConstruction.cpp           | 23 ++++++--
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 17 ++----
 .../Transforms/Vectorize/VPlanTransforms.h    | 53 +++++++++++++------
 4 files changed, 59 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 26462483b2059..1dcd33828445a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8463,15 +8463,13 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     InterleaveGroups.insert(IG);
   }
 
-  DenseMap<VPBasicBlock *, VPValue *> SuccessorMasks;
   if (CM.foldTailByMasking())
-    VPlanTransforms::foldTailByMasking(*Plan, SuccessorMasks);
+    VPlanTransforms::foldTailByMasking(*Plan);
 
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
   // ---------------------------------------------------------------------------
-  auto BlockMaskCache =
-      VPlanTransforms::introduceMasksAndLinearize(*Plan, SuccessorMasks);
+  auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(*Plan);
 
   // ---------------------------------------------------------------------------
   // Construct wide recipes and apply predication for original scalar
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 8daef2ba03825..97ad127658320 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -958,8 +958,7 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
   TopRegion->getEntryBasicBlock()->setName("vector.body");
 }
 
-void VPlanTransforms::foldTailByMasking(
-    VPlan &Plan, DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks) {
+void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   assert(Plan.getExitBlocks().size() == 1 &&
          "only a single-exit block is supported currently");
   assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
@@ -971,16 +970,30 @@ void VPlanTransforms::foldTailByMasking(
 
   Header->splitAt(Header->getFirstNonPhi());
 
-  // Create the header mask and insert it in the header.
+  // Create the header mask, insert it in the header and branch on it.
   auto *IV =
       new VPWidenCanonicalIVRecipe(Header->getParent()->getCanonicalIV());
   VPBuilder Builder(Header, Header->getFirstNonPhi());
   Builder.insert(IV);
   VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
   VPValue *HeaderMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+  Builder.createNaryOp(VPInstruction::BranchOnCond, HeaderMask);
+
+  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
+  VPValue *IVInc;
+  [[maybe_unused]] bool TermBranchOnCount =
+      match(Latch->getTerminator(),
+            m_BranchOnCount(m_VPValue(IVInc),
+                            m_Specific(&Plan.getVectorTripCount())));
+  assert(TermBranchOnCount &&
+         match(IVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
+                            m_Specific(&Plan.getVFxUF()))) &&
+         "Unexpected terminator");
 
-  // Predicate everything after the header mask.
-  SuccessorMasks[Header] = HeaderMask;
+  // Split the latch at the IV update, and branch to it from the header mask.
+  VPBasicBlock *LatchSplit =
+      Latch->splitAt(IVInc->getDefiningRecipe()->getIterator());
+  VPBlockUtils::connectBlocks(Header, LatchSplit);
 
   // Any extract of the last element must be updated to extract from the last
   // active lane of the header mask instead (i.e., the lane corresponding to the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 251a3afa4634d..96bee083875af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -35,10 +35,6 @@ class VPPredicator {
 
   BlockMaskCacheTy BlockMaskCache;
 
-  /// A map of blocks to masks that should be applied to each of its successors'
-  /// edges.
-  const BlockMaskCacheTy SuccessorMasks;
-
   /// Create an edge mask for every destination of cases and/or default.
   void createSwitchEdgeMasks(VPInstruction *SI);
 
@@ -65,9 +61,6 @@ class VPPredicator {
   }
 
 public:
-  VPPredicator(const BlockMaskCacheTy &SuccessorMasks)
-      : SuccessorMasks(SuccessorMasks) {}
-
   /// Returns the *entry* mask for \p VPBB.
   VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
     return BlockMaskCache.lookup(VPBB);
@@ -99,10 +92,6 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
 
   VPValue *SrcMask = getBlockInMask(Src);
 
-  if (VPValue *SuccessorMask = SuccessorMasks.lookup(Src))
-    SrcMask = SrcMask ? Builder.createLogicalAnd(SrcMask, SuccessorMask)
-                      : SuccessorMask;
-
   // If there's a single successor, there's no terminator recipe.
   if (Src->getNumSuccessors() == 1)
     return setEdgeMask(Src, Dst, SrcMask);
@@ -245,15 +234,15 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
   }
 }
 
-DenseMap<VPBasicBlock *, VPValue *> VPlanTransforms::introduceMasksAndLinearize(
-    VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks) {
+DenseMap<VPBasicBlock *, VPValue *>
+VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Header);
-  VPPredicator Predicator(SuccessorMasks);
+  VPPredicator Predicator;
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
     auto *VPBB = cast<VPBasicBlock>(VPB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index cf0147e5a223c..e00c94558bf7a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -413,25 +413,44 @@ struct VPlanTransforms {
   static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                                      TypeSize VectorRegWidth);
 
-  /// Adapts the vector loop region for tail folding by inserting a header mask
-  /// after the first non-phi in the header:
+  /// Adapts the vector loop region for tail folding by introducing a header
+  /// mask and predicating the region:
   ///
-  /// %wide.iv = widen-canonical-iv ...
-  /// %header-mask = icmp ult %wide.iv, BTC
+  /// Vector loop region before:
+  /// +-------------------------------------------+
+  /// |%iv = ...                                  |
+  /// |...                                        |
+  /// |%iv.next = add %iv, vfxuf                  |
+  /// |branch-on-count %iv.next, vector-trip-count|
+  /// +-------------------------------------------+
   ///
-  /// The header is then split at the header mask, and everything afterwards is
-  /// predicated by adding the header mask to \p SuccessorMasks. Any
-  /// VPInstruction::ExtractLastLanes are also updated to extract from the last
-  /// active lane of the header mask.
-  static void
-  foldTailByMasking(VPlan &Plan,
-                    DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks);
-
-  /// Predicate and linearize the control-flow in the only loop region of \p
-  /// Plan. Masks for blocks are added to a block-to-mask map which is returned
-  /// in order to be used later for wide recipe construction.
-  static DenseMap<VPBasicBlock *, VPValue *> introduceMasksAndLinearize(
-      VPlan &Plan, const DenseMap<VPBasicBlock *, VPValue *> &SuccessorMasks);
+  /// Vector loop region after:
+  /// +-------------------------------------------+
+  /// |%iv = ...                                  |
+  /// |%wide.iv = widen-canonical-iv ...          |
+  /// |%header-mask = icmp ule %wide.iv, BTC      |
+  /// |branch-on-cond %header-mask                |---+
+  /// +-------------------------------------------+   |
+  ///                      |                          |
+  ///                      v                          |
+  /// +-------------------------------------------+   |
+  /// |                   ...                     |   |
+  /// +-------------------------------------------+   |
+  ///                      |                          |
+  ///                      v                          |
+  /// +-------------------------------------------+   |
+  /// |%iv.next = add %iv, vfxuf                  |<--+
+  /// |branch-on-count %iv.next, vector-trip-count|
+  /// +-------------------------------------------+
+  ///
+  /// Any VPInstruction::ExtractLastLanes are also updated to extract from the
+  /// last active lane of the header mask.
+  static void foldTailByMasking(VPlan &Plan);
+
+  /// Predicate and linearize the control-flow in the only loop region of
+  /// \p Plan.
+  static DenseMap<VPBasicBlock *, VPValue *>
+  introduceMasksAndLinearize(VPlan &Plan);
 
   /// Add branch weight metadata, if the \p Plan's middle block is terminated by
   /// a BranchOnCond recipe.

>From b7842de37323a15a8b50a99ba598f65e1283c4e8 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 24 Jan 2026 00:03:16 +0800
Subject: [PATCH 06/24] Maintain SSA

---
 .../Vectorize/VPlanConstruction.cpp           | 27 ++++++++++++++++++-
 .../Transforms/Vectorize/VPlanPredicator.cpp  |  8 +++++-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |  8 ++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 97ad127658320..10bec7c91af1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -968,7 +968,7 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
-  Header->splitAt(Header->getFirstNonPhi());
+  VPBasicBlock *HeaderSplit = Header->splitAt(Header->getFirstNonPhi());
 
   // Create the header mask, insert it in the header and branch on it.
   auto *IV =
@@ -995,6 +995,31 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
       Latch->splitAt(IVInc->getDefiningRecipe()->getIterator());
   VPBlockUtils::connectBlocks(Header, LatchSplit);
 
+  // Insert phis for any values used outside of the predicated body.
+  auto NeedsPhi = [&Header, &Plan](VPUser *U) {
+    auto *UR = cast<VPRecipeBase>(U);
+    return UR->getParent() == Header ||
+           UR->getRegion() != Plan.getVectorLoopRegion();
+  };
+
+  VPTypeAnalysis TypeInfo(Plan);
+  Builder.setInsertPoint(LatchSplit, LatchSplit->begin());
+  for (VPBlockBase *VPB : vp_depth_first_shallow(HeaderSplit)) {
+    auto *VPBB = cast<VPBasicBlock>(VPB);
+    if (VPBB == LatchSplit)
+      continue;
+    for (VPRecipeBase &R : *VPBB) {
+      for (VPValue *V : R.definedValues()) {
+        if (!any_of(V->users(), NeedsPhi))
+          continue;
+        VPValue *Poison = Plan.getOrAddLiveIn(
+            PoisonValue::get(V->getUnderlyingValue()->getType()));
+        VPValue *Phi = Builder.createScalarPhi({V, Poison}, {});
+        V->replaceUsesWithIf(Phi, NeedsPhi);
+      }
+    }
+  }
+
   // Any extract of the last element must be updated to extract from the last
   // active lane of the header mask instead (i.e., the lane corresponding to the
   // last active iteration).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 96bee083875af..b62e06025bbcb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -214,7 +214,13 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
     // be duplications since this is a simple recursive scan, but future
     // optimizations will clean it up.
 
-    if (all_equal(PhiR->incoming_values())) {
+    auto NotPoison = [&](VPValue *V) {
+      // Don't remove poison from phis from the original loop.
+      return PhiR->getUnderlyingValue() || !isa<VPIRValue>(V) ||
+             !isa<PoisonValue>(cast<VPIRValue>(V)->getValue());
+    };
+
+    if (all_equal(make_filter_range(PhiR->incoming_values(), NotPoison))) {
       PhiR->replaceAllUsesWith(PhiR->getIncomingValue(0));
       PhiR->eraseFromParent();
       continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index cae0633dbb185..dbdbf1027c0f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -158,6 +158,14 @@ class LLVM_ABI_FOR_TEST VPValue {
       VPValue *New,
       llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace);
 
+  inline void
+  replaceUsesWithIf(VPValue *New,
+                    llvm::function_ref<bool(VPUser *U)> ShouldReplace) {
+    return replaceUsesWithIf(New, [&ShouldReplace](VPUser &U, unsigned) {
+      return ShouldReplace(&U);
+    });
+  }
+
   /// Returns the recipe defining this VPValue or nullptr if it is not defined
   /// by a recipe, i.e. is a live-in.
   VPRecipeBase *getDefiningRecipe();

>From b88529d493568dcba28f81c7b182b427648af326 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sun, 25 Jan 2026 17:50:25 +0800
Subject: [PATCH 07/24] Remove unused VPTypeAnalysis

---
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 10bec7c91af1e..66fe4ba80ab37 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1002,7 +1002,6 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
            UR->getRegion() != Plan.getVectorLoopRegion();
   };
 
-  VPTypeAnalysis TypeInfo(Plan);
   Builder.setInsertPoint(LatchSplit, LatchSplit->begin());
   for (VPBlockBase *VPB : vp_depth_first_shallow(HeaderSplit)) {
     auto *VPBB = cast<VPBasicBlock>(VPB);

>From df47d1e36bfef827997062dffd85ad3ff2f6cd77 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 29 Jan 2026 16:32:11 +0800
Subject: [PATCH 08/24] Assert no other recipes are in new latch

---
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 66fe4ba80ab37..ca60a163d5ad4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -988,7 +988,9 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   assert(TermBranchOnCount &&
          match(IVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
                             m_Specific(&Plan.getVFxUF()))) &&
-         "Unexpected terminator");
+         std::next(IVInc->getDefiningRecipe()->getIterator()) ==
+             Latch->getTerminator()->getIterator() &&
+         "Unexpected canonical iv increment");
 
   // Split the latch at the IV update, and branch to it from the header mask.
   VPBasicBlock *LatchSplit =

>From 8c58ad6236b0f557ba5012a0fba4889bc528752e Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 29 Jan 2026 17:14:23 +0800
Subject: [PATCH 09/24] Iterate through defs instead of uses

---
 .../Vectorize/VPlanConstruction.cpp           | 27 ++++++++-----------
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |  8 ------
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index ca60a163d5ad4..968cfe15da6d4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -968,7 +968,7 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
-  VPBasicBlock *HeaderSplit = Header->splitAt(Header->getFirstNonPhi());
+  Header->splitAt(Header->getFirstNonPhi());
 
   // Create the header mask, insert it in the header and branch on it.
   auto *IV =
@@ -997,26 +997,21 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
       Latch->splitAt(IVInc->getDefiningRecipe()->getIterator());
   VPBlockUtils::connectBlocks(Header, LatchSplit);
 
-  // Insert phis for any values used outside of the predicated body.
-  auto NeedsPhi = [&Header, &Plan](VPUser *U) {
-    auto *UR = cast<VPRecipeBase>(U);
-    return UR->getParent() == Header ||
-           UR->getRegion() != Plan.getVectorLoopRegion();
-  };
-
+  // Insert phis for any values in the predicated body that are used outside.
   Builder.setInsertPoint(LatchSplit, LatchSplit->begin());
-  for (VPBlockBase *VPB : vp_depth_first_shallow(HeaderSplit)) {
-    auto *VPBB = cast<VPBasicBlock>(VPB);
-    if (VPBB == LatchSplit)
-      continue;
+  for (VPBasicBlock *VPBB : {Header, Plan.getMiddleBlock()}) {
     for (VPRecipeBase &R : *VPBB) {
-      for (VPValue *V : R.definedValues()) {
-        if (!any_of(V->users(), NeedsPhi))
+      for (VPValue *V : R.operands()) {
+        VPRecipeBase *VR = V->getDefiningRecipe();
+        if (!VR || !VR->getRegion() || VR->getParent() == LatchSplit ||
+            VR->getParent() == VPBB)
           continue;
+        // TODO: For reduction phis, use phi value instead of poison.
         VPValue *Poison = Plan.getOrAddLiveIn(
             PoisonValue::get(V->getUnderlyingValue()->getType()));
-        VPValue *Phi = Builder.createScalarPhi({V, Poison}, {});
-        V->replaceUsesWithIf(Phi, NeedsPhi);
+        VPInstruction *Phi = Builder.createScalarPhi({V, Poison}, {});
+        V->replaceUsesWithIf(Phi,
+                             [&Phi](VPUser &U, unsigned) { return &U != Phi; });
       }
     }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index dbdbf1027c0f7..cae0633dbb185 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -158,14 +158,6 @@ class LLVM_ABI_FOR_TEST VPValue {
       VPValue *New,
       llvm::function_ref<bool(VPUser &U, unsigned Idx)> ShouldReplace);
 
-  inline void
-  replaceUsesWithIf(VPValue *New,
-                    llvm::function_ref<bool(VPUser *U)> ShouldReplace) {
-    return replaceUsesWithIf(New, [&ShouldReplace](VPUser &U, unsigned) {
-      return ShouldReplace(&U);
-    });
-  }
-
   /// Returns the recipe defining this VPValue or nullptr if it is not defined
   /// by a recipe, i.e. is a live-in.
   VPRecipeBase *getDefiningRecipe();

>From 46714282fdd31931031e6cc5d6431222512e782d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 6 Feb 2026 01:53:11 +0800
Subject: [PATCH 10/24] Add asserts

---
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 51f8ed2a0983e..f1b9e9cf444f1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -965,13 +965,18 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
 }
 
 void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   assert(Plan.getExitBlocks().size() == 1 &&
          "only a single-exit block is supported currently");
   assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
              Plan.getMiddleBlock() &&
          "the exit block must have middle block as single predecessor");
+  // TODO: Handle all successors, not just the middle block when supporting
+  // early exits.
+  assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+         "The vector loop region must have the middle block as its single "
+         "successor for now");
 
-  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
   Header->splitAt(Header->getFirstNonPhi());
@@ -1003,7 +1008,8 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
       Latch->splitAt(IVInc->getDefiningRecipe()->getIterator());
   VPBlockUtils::connectBlocks(Header, LatchSplit);
 
-  // Insert phis for any values in the predicated body that are used outside.
+  // Insert phis for any values in the predicated body used outside. Currently,
+  // this consists of header phis and extracts in the middle block.
   Builder.setInsertPoint(LatchSplit, LatchSplit->begin());
   for (VPBasicBlock *VPBB : {Header, Plan.getMiddleBlock()}) {
     for (VPRecipeBase &R : *VPBB) {
@@ -1012,6 +1018,9 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
         if (!VR || !VR->getRegion() || VR->getParent() == LatchSplit ||
             VR->getParent() == VPBB)
           continue;
+        assert((isa<VPHeaderPHIRecipe>(R) ||
+                match(&R, m_ExtractLastPart(m_Specific(V)))) &&
+               "Unexpected user of value defined inside vector loop region");
         // TODO: For reduction phis, use phi value instead of poison.
         VPValue *Poison = Plan.getOrAddLiveIn(
             PoisonValue::get(V->getUnderlyingValue()->getType()));

>From 881d2861b9c1a17099e3dc964f4408ed762d9e81 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 6 Feb 2026 02:16:29 +0800
Subject: [PATCH 11/24] Undo convertPhisToBlends change

---
 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index b62e06025bbcb..55959ec43f090 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -205,8 +205,7 @@ void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
 void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
   SmallVector<VPPhi *> Phis;
   for (VPRecipeBase &R : VPBB->phis())
-    if (auto *PhiR = dyn_cast<VPPhi>(&R))
-      Phis.push_back(PhiR);
+    Phis.push_back(cast<VPPhi>(&R));
   for (VPPhi *PhiR : Phis) {
     // The non-header Phi is converted into a Blend recipe below,
     // so we don't have to worry about the insertion order and we can just use
@@ -252,6 +251,8 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
     auto *VPBB = cast<VPBasicBlock>(VPB);
+    if (VPBB == Header)
+      continue;
     Predicator.createBlockInMask(VPBB);
     Predicator.convertPhisToBlends(VPBB);
   }

>From d705ff80b973eb09eb1ad07407888b05f0abb58d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 6 Feb 2026 02:20:24 +0800
Subject: [PATCH 12/24] Update diagram with phis

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 25aeaa96df413..22745995348bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -453,6 +453,7 @@ struct VPlanTransforms {
   ///                      |                          |
   ///                      v                          |
   /// +-------------------------------------------+   |
+  /// |<phis> = phi [..., ...], [poison, header]  |
   /// |%iv.next = add %iv, vfxuf                  |<--+
   /// |branch-on-count %iv.next, vector-trip-count|
   /// +-------------------------------------------+

>From 0b4b3a4926daf44f0a144ffa689d098821579e60 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 6 Feb 2026 20:29:02 +0800
Subject: [PATCH 13/24] Check Header instead of VPBB, move assert closer

---
 .../lib/Transforms/Vectorize/VPlanConstruction.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index e12ebef645a4b..a7e0ea0781168 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -965,18 +965,13 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
 }
 
 void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
-  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   assert(Plan.getExitBlocks().size() == 1 &&
          "only a single-exit block is supported currently");
   assert(Plan.getExitBlocks().front()->getSinglePredecessor() ==
              Plan.getMiddleBlock() &&
          "the exit block must have middle block as single predecessor");
-  // TODO: Handle all successors, not just the middle block when supporting
-  // early exits.
-  assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
-         "The vector loop region must have the middle block as its single "
-         "successor for now");
 
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
   Header->splitAt(Header->getFirstNonPhi());
@@ -1010,13 +1005,18 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
 
   // Insert phis for any values in the predicated body used outside. Currently,
   // this consists of header phis and extracts in the middle block.
+  // TODO: Handle all successors, not just the middle block when supporting
+  // early exits.
+  assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+         "The vector loop region must have the middle block as its single "
+         "successor for now");
   Builder.setInsertPoint(LatchSplit, LatchSplit->begin());
   for (VPBasicBlock *VPBB : {Header, Plan.getMiddleBlock()}) {
     for (VPRecipeBase &R : *VPBB) {
       for (VPValue *V : R.operands()) {
         VPRecipeBase *VR = V->getDefiningRecipe();
         if (!VR || !VR->getRegion() || VR->getParent() == LatchSplit ||
-            VR->getParent() == VPBB)
+            VR->getParent() == Header)
           continue;
         assert((isa<VPHeaderPHIRecipe>(R) ||
                 match(&R, m_ExtractLastPart(m_Specific(V)))) &&

>From 1ada464909dd156c7750f7e4e07a38242319967e Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 9 Feb 2026 17:15:27 +0800
Subject: [PATCH 14/24] Update TODO

---
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 7e1a6f66ab275..a63cc123b6958 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1046,7 +1046,9 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
                               m_VPInstruction<VPInstruction::ExitingIVValue>(),
                               m_ExtractLastPart(m_Specific(V))))) &&
                "Unexpected user of value defined inside vector loop region");
-        // TODO: For reduction phis, use phi value instead of poison.
+        // TODO: For reduction phis, use phi value instead of poison so we can
+        // remove the special casing for tail folding in
+        // LoopVectorizationPlanner::addReductionResultComputation
         VPValue *Poison = Plan.getOrAddLiveIn(
             PoisonValue::get(V->getUnderlyingValue()->getType()));
         VPInstruction *Phi = Builder.createScalarPhi({V, Poison}, {});

>From 6ce3660546c3ac4232425a83c195038fdc498f65 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 16 Feb 2026 22:35:03 +0800
Subject: [PATCH 15/24] Add VPlan tests

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   2 +-
 .../LoopVectorize/VPlan/tail-folding.ll       | 173 ++++++++++++++++++
 2 files changed, 174 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7bf569c7f0a06..9d3f76ae53fd4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8229,7 +8229,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   }
 
   if (CM.foldTailByMasking())
-    VPlanTransforms::foldTailByMasking(*Plan);
+    RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::foldTailByMasking, *Plan);
 
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
new file mode 100644
index 0000000000000..4d74b26e5d00d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
@@ -0,0 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -p loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S -vplan-print-after=foldTailByMasking -disable-output 2>&1 | FileCheck %s
+
+define i32 @live_out(ptr noalias %p, i32 %n) {
+; CHECK-LABEL: VPlan for loop in 'live_out'
+; CHECK:  VPlan ' for UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count
+; CHECK-NEXT:  Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]>
+; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
+; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
+; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
+; CHECK-NEXT:    Successor(s): vector.body.split, vector.body.split.split
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split:
+; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
+; CHECK-NEXT:      EMIT ir<%x> = load ir<%gep>
+; CHECK-NEXT:      EMIT ir<%y> = add ir<%x>, ir<1>
+; CHECK-NEXT:      EMIT store vp<[[VP8:%[0-9]+]]>, ir<%gep>
+; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
+; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
+; CHECK-NEXT:    Successor(s): vector.body.split.split
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split.split:
+; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%y>, vector.body.split ], [ ir<poison>, vector.body ]
+; CHECK-NEXT:      EMIT vp<%index.next> = add vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<[[VP10:%[0-9]+]]> = extract-last-part vp<[[VP8]]>
+; CHECK-NEXT:    EMIT vp<[[VP11:%[0-9]+]]> = extract-last-lane vp<[[VP10]]>
+; CHECK-NEXT:    EMIT vp<[[VP12:%[0-9]+]]> = last-active-lane vp<[[VP6]]>
+; CHECK-NEXT:    EMIT vp<[[VP13:%[0-9]+]]> = extract-lane vp<[[VP12]]>, vp<[[VP8]]>
+; CHECK-NEXT:    EMIT branch-on-cond ir<true>
+; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<exit>:
+; CHECK-NEXT:    IR   %y.lcssa = phi i32 [ %y, %loop ] (extra operand: vp<[[VP13]]> from middle.block)
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP15:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:  Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<loop>:
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP15]]> from scalar.ph)
+; CHECK-NEXT:    IR   %gep = getelementptr i32, ptr %p, i32 %iv
+; CHECK-NEXT:    IR   %x = load i32, ptr %gep, align 4
+; CHECK-NEXT:    IR   %y = add i32 %x, 1
+; CHECK-NEXT:    IR   store i32 %y, ptr %gep, align 4
+; CHECK-NEXT:    IR   %iv.next = add i32 %iv, 1
+; CHECK-NEXT:    IR   %ec = icmp eq i32 %iv.next, %n
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  %gep = getelementptr i32, ptr %p, i32 %iv
+  %x = load i32, ptr %gep
+  %y = add i32 %x, 1
+  store i32 %y, ptr %gep
+  %iv.next = add i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %y
+}
+
+define i32 @reduction(ptr noalias %p, i32 %n) {
+; CHECK-LABEL: VPlan for loop in 'reduction'
+; CHECK:  VPlan ' for UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count
+; CHECK-NEXT:  Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]>
+; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, vp<[[VP8:%[0-9]+]]>
+; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
+; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
+; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
+; CHECK-NEXT:    Successor(s): vector.body.split, vector.body.split.split
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split:
+; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
+; CHECK-NEXT:      EMIT ir<%x> = load ir<%gep>
+; CHECK-NEXT:      EMIT ir<%rdx.next> = add ir<%rdx>, ir<%x>
+; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
+; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
+; CHECK-NEXT:    Successor(s): vector.body.split.split
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split.split:
+; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%rdx.next>, vector.body.split ], [ ir<poison>, vector.body ]
+; CHECK-NEXT:      EMIT vp<%index.next> = add vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<[[VP10:%[0-9]+]]> = extract-last-part vp<[[VP8]]>
+; CHECK-NEXT:    EMIT vp<[[VP11:%[0-9]+]]> = extract-last-lane vp<[[VP10]]>
+; CHECK-NEXT:    EMIT vp<[[VP12:%[0-9]+]]> = last-active-lane vp<[[VP6]]>
+; CHECK-NEXT:    EMIT vp<[[VP13:%[0-9]+]]> = extract-lane vp<[[VP12]]>, vp<[[VP8]]>
+; CHECK-NEXT:    EMIT branch-on-cond ir<true>
+; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<exit>:
+; CHECK-NEXT:    IR   %rdx.next.lcssa = phi i32 [ %rdx.next, %loop ] (extra operand: vp<[[VP13]]> from middle.block)
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP15:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP16:%[0-9]+]]> = phi [ ir<%rdx>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:  Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<loop>:
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP15]]> from scalar.ph)
+; CHECK-NEXT:    IR   %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] (extra operand: vp<[[VP16]]> from scalar.ph)
+; CHECK-NEXT:    IR   %gep = getelementptr i32, ptr %p, i32 %iv
+; CHECK-NEXT:    IR   %x = load i32, ptr %gep, align 4
+; CHECK-NEXT:    IR   %rdx.next = add i32 %rdx, %x
+; CHECK-NEXT:    IR   %iv.next = add i32 %iv, 1
+; CHECK-NEXT:    IR   %ec = icmp eq i32 %iv.next, %n
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  %rdx = phi i32 [0, %entry], [%rdx.next, %loop]
+  %gep = getelementptr i32, ptr %p, i32 %iv
+  %x = load i32, ptr %gep
+  %rdx.next = add i32 %rdx, %x
+  %iv.next = add i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %rdx.next
+}

>From 315cded6e2599f09ec3c2318a7d98998e3b1de7d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 17 Feb 2026 14:57:15 +0800
Subject: [PATCH 16/24] Move to just after createLoopRegions, precommit for
 vplan test diff

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  7 +-
 .../LoopVectorize/VPlan/tail-folding.ll       | 64 ++++++-------------
 .../VPlan/vplan-print-after-all.ll            |  1 +
 3 files changed, 23 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9d3f76ae53fd4..64fd39c9325ef 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8176,7 +8176,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
                                   CM.foldTailByMasking());
 
-  VPlanTransforms::createLoopRegions(*Plan);
+  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::createLoopRegions, *Plan);
+  if (CM.foldTailByMasking())
+    RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::foldTailByMasking, *Plan);
 
   // Don't use getDecisionAndClampRange here, because we don't know the UF
   // so this function is better to be conservative, rather than to split
@@ -8228,9 +8230,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
     InterleaveGroups.insert(IG);
   }
 
-  if (CM.foldTailByMasking())
-    RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::foldTailByMasking, *Plan);
-
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
   // ---------------------------------------------------------------------------
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
index 4d74b26e5d00d..8cebcc75bb81f 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -p loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S -vplan-print-after=foldTailByMasking -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -p loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S -vplan-print-after=createLoopRegions -disable-output 2>&1 | FileCheck %s
 
 define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-LABEL: VPlan for loop in 'live_out'
@@ -7,7 +7,6 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
 ; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
 ; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
-; CHECK-NEXT:  Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count
 ; CHECK-NEXT:  Live-in ir<%n> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<entry>:
@@ -18,48 +17,36 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]>
-; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
-; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
-; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
-; CHECK-NEXT:    Successor(s): vector.body.split, vector.body.split.split
-; CHECK-EMPTY:
-; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
 ; CHECK-NEXT:      EMIT ir<%x> = load ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%y> = add ir<%x>, ir<1>
-; CHECK-NEXT:      EMIT store vp<[[VP8:%[0-9]+]]>, ir<%gep>
+; CHECK-NEXT:      EMIT store ir<%y>, ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:    Successor(s): vector.body.split.split
-; CHECK-EMPTY:
-; CHECK-NEXT:    vector.body.split.split:
-; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%y>, vector.body.split ], [ ir<poison>, vector.body ]
-; CHECK-NEXT:      EMIT vp<%index.next> = add vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[VP10:%[0-9]+]]> = extract-last-part vp<[[VP8]]>
-; CHECK-NEXT:    EMIT vp<[[VP11:%[0-9]+]]> = extract-last-lane vp<[[VP10]]>
-; CHECK-NEXT:    EMIT vp<[[VP12:%[0-9]+]]> = last-active-lane vp<[[VP6]]>
-; CHECK-NEXT:    EMIT vp<[[VP13:%[0-9]+]]> = extract-lane vp<[[VP12]]>, vp<[[VP8]]>
+; CHECK-NEXT:    EMIT vp<[[VP5:%[0-9]+]]> = extract-last-part ir<%y>
+; CHECK-NEXT:    EMIT vp<[[VP6:%[0-9]+]]> = extract-last-lane vp<[[VP5]]>
 ; CHECK-NEXT:    EMIT branch-on-cond ir<true>
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<exit>:
-; CHECK-NEXT:    IR   %y.lcssa = phi i32 [ %y, %loop ] (extra operand: vp<[[VP13]]> from middle.block)
+; CHECK-NEXT:    IR   %y.lcssa = phi i32 [ %y, %loop ] (extra operand: vp<[[VP6]]> from middle.block)
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[VP15:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP15]]> from scalar.ph)
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP8]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %gep = getelementptr i32, ptr %p, i32 %iv
 ; CHECK-NEXT:    IR   %x = load i32, ptr %gep, align 4
 ; CHECK-NEXT:    IR   %y = add i32 %x, 1
@@ -92,7 +79,6 @@ define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
 ; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
 ; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
-; CHECK-NEXT:  Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count
 ; CHECK-NEXT:  Live-in ir<%n> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<entry>:
@@ -103,50 +89,38 @@ define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]>
-; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, vp<[[VP8:%[0-9]+]]>
-; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
-; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
-; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
-; CHECK-NEXT:    Successor(s): vector.body.split, vector.body.split.split
-; CHECK-EMPTY:
-; CHECK-NEXT:    vector.body.split:
+; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, ir<%rdx.next>
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
 ; CHECK-NEXT:      EMIT ir<%x> = load ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%rdx.next> = add ir<%rdx>, ir<%x>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:    Successor(s): vector.body.split.split
-; CHECK-EMPTY:
-; CHECK-NEXT:    vector.body.split.split:
-; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%rdx.next>, vector.body.split ], [ ir<poison>, vector.body ]
-; CHECK-NEXT:      EMIT vp<%index.next> = add vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[VP10:%[0-9]+]]> = extract-last-part vp<[[VP8]]>
-; CHECK-NEXT:    EMIT vp<[[VP11:%[0-9]+]]> = extract-last-lane vp<[[VP10]]>
-; CHECK-NEXT:    EMIT vp<[[VP12:%[0-9]+]]> = last-active-lane vp<[[VP6]]>
-; CHECK-NEXT:    EMIT vp<[[VP13:%[0-9]+]]> = extract-lane vp<[[VP12]]>, vp<[[VP8]]>
+; CHECK-NEXT:    EMIT vp<[[VP5:%[0-9]+]]> = extract-last-part ir<%rdx.next>
+; CHECK-NEXT:    EMIT vp<[[VP6:%[0-9]+]]> = extract-last-lane vp<[[VP5]]>
 ; CHECK-NEXT:    EMIT branch-on-cond ir<true>
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<exit>:
-; CHECK-NEXT:    IR   %rdx.next.lcssa = phi i32 [ %rdx.next, %loop ] (extra operand: vp<[[VP13]]> from middle.block)
+; CHECK-NEXT:    IR   %rdx.next.lcssa = phi i32 [ %rdx.next, %loop ] (extra operand: vp<[[VP6]]> from middle.block)
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[VP15:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[VP16:%[0-9]+]]> = phi [ ir<%rdx>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%rdx>, middle.block ], [ ir<0>, ir-bb<entry> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP15]]> from scalar.ph)
-; CHECK-NEXT:    IR   %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] (extra operand: vp<[[VP16]]> from scalar.ph)
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP8]]> from scalar.ph)
+; CHECK-NEXT:    IR   %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] (extra operand: vp<[[VP9]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %gep = getelementptr i32, ptr %p, i32 %iv
 ; CHECK-NEXT:    IR   %x = load i32, ptr %gep, align 4
 ; CHECK-NEXT:    IR   %rdx.next = add i32 %rdx, %x
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
index 478fa82ccd218..6f5e35c9feb58 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-print-after-all.ll
@@ -4,6 +4,7 @@
 ; Verify that `-vplan-print-after-all` option works.
 
 ; CHECK: VPlan for loop in 'foo' after printAfterInitialConstruction
+; CHECK: VPlan for loop in 'foo' after VPlanTransforms::createLoopRegions
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::clearReductionWrapFlags
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::optimizeFindIVReductions
 ; CHECK: VPlan for loop in 'foo' after VPlanTransforms::handleMultiUseReductions

>From d8497efd8b0d169c5d9983e51fb37c36cba3a3eb Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 17 Feb 2026 14:58:34 +0800
Subject: [PATCH 17/24] VPlan test diff

---
 .../LoopVectorize/VPlan/tail-folding.ll       | 64 +++++++++++++------
 1 file changed, 45 insertions(+), 19 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
index 8cebcc75bb81f..711bcd6376a53 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -p loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S -vplan-print-after=createLoopRegions -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -p loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S -vplan-print-after=foldTailByMasking -disable-output 2>&1 | FileCheck %s
 
 define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-LABEL: VPlan for loop in 'live_out'
@@ -7,6 +7,7 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
 ; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
 ; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count
 ; CHECK-NEXT:  Live-in ir<%n> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<entry>:
@@ -17,36 +18,48 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]>
+; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
+; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
+; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
+; CHECK-NEXT:    Successor(s): vector.body.split, vector.body.split.split
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
 ; CHECK-NEXT:      EMIT ir<%x> = load ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%y> = add ir<%x>, ir<1>
-; CHECK-NEXT:      EMIT store ir<%y>, ir<%gep>
+; CHECK-NEXT:      EMIT store vp<[[VP8:%[0-9]+]]>, ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT:    Successor(s): vector.body.split.split
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split.split:
+; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%y>, vector.body.split ], [ ir<poison>, vector.body ]
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[VP5:%[0-9]+]]> = extract-last-part ir<%y>
-; CHECK-NEXT:    EMIT vp<[[VP6:%[0-9]+]]> = extract-last-lane vp<[[VP5]]>
+; CHECK-NEXT:    EMIT vp<[[VP10:%[0-9]+]]> = extract-last-part vp<[[VP8]]>
+; CHECK-NEXT:    EMIT vp<[[VP11:%[0-9]+]]> = extract-last-lane vp<[[VP10]]>
+; CHECK-NEXT:    EMIT vp<[[VP12:%[0-9]+]]> = last-active-lane vp<[[VP6]]>
+; CHECK-NEXT:    EMIT vp<[[VP13:%[0-9]+]]> = extract-lane vp<[[VP12]]>, vp<[[VP8]]>
 ; CHECK-NEXT:    EMIT branch-on-cond ir<true>
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<exit>:
-; CHECK-NEXT:    IR   %y.lcssa = phi i32 [ %y, %loop ] (extra operand: vp<[[VP6]]> from middle.block)
+; CHECK-NEXT:    IR   %y.lcssa = phi i32 [ %y, %loop ] (extra operand: vp<[[VP13]]> from middle.block)
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP15:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP8]]> from scalar.ph)
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP15]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %gep = getelementptr i32, ptr %p, i32 %iv
 ; CHECK-NEXT:    IR   %x = load i32, ptr %gep, align 4
 ; CHECK-NEXT:    IR   %y = add i32 %x, 1
@@ -79,6 +92,7 @@ define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
 ; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
 ; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count
 ; CHECK-NEXT:  Live-in ir<%n> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<entry>:
@@ -89,38 +103,50 @@ define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
 ; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]>
-; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, ir<%rdx.next>
+; CHECK-NEXT:      WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, vp<[[VP8:%[0-9]+]]>
+; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
+; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
+; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
+; CHECK-NEXT:    Successor(s): vector.body.split, vector.body.split.split
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
 ; CHECK-NEXT:      EMIT ir<%x> = load ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%rdx.next> = add ir<%rdx>, ir<%x>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT:    Successor(s): vector.body.split.split
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split.split:
+; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%rdx.next>, vector.body.split ], [ ir<poison>, vector.body ]
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[VP5:%[0-9]+]]> = extract-last-part ir<%rdx.next>
-; CHECK-NEXT:    EMIT vp<[[VP6:%[0-9]+]]> = extract-last-lane vp<[[VP5]]>
+; CHECK-NEXT:    EMIT vp<[[VP10:%[0-9]+]]> = extract-last-part vp<[[VP8]]>
+; CHECK-NEXT:    EMIT vp<[[VP11:%[0-9]+]]> = extract-last-lane vp<[[VP10]]>
+; CHECK-NEXT:    EMIT vp<[[VP12:%[0-9]+]]> = last-active-lane vp<[[VP6]]>
+; CHECK-NEXT:    EMIT vp<[[VP13:%[0-9]+]]> = extract-lane vp<[[VP12]]>, vp<[[VP8]]>
 ; CHECK-NEXT:    EMIT branch-on-cond ir<true>
 ; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<exit>:
-; CHECK-NEXT:    IR   %rdx.next.lcssa = phi i32 [ %rdx.next, %loop ] (extra operand: vp<[[VP6]]> from middle.block)
+; CHECK-NEXT:    IR   %rdx.next.lcssa = phi i32 [ %rdx.next, %loop ] (extra operand: vp<[[VP13]]> from middle.block)
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[VP9:%[0-9]+]]> = phi [ ir<%rdx>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP15:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP16:%[0-9]+]]> = phi [ ir<%rdx>, middle.block ], [ ir<0>, ir-bb<entry> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<loop>:
-; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP8]]> from scalar.ph)
-; CHECK-NEXT:    IR   %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] (extra operand: vp<[[VP9]]> from scalar.ph)
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[VP15]]> from scalar.ph)
+; CHECK-NEXT:    IR   %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] (extra operand: vp<[[VP16]]> from scalar.ph)
 ; CHECK-NEXT:    IR   %gep = getelementptr i32, ptr %p, i32 %iv
 ; CHECK-NEXT:    IR   %x = load i32, ptr %gep, align 4
 ; CHECK-NEXT:    IR   %rdx.next = add i32 %rdx, %x

>From 8ef954df2675e9d0f9181ac28a99ed28d9e1f8ac Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 18 Feb 2026 00:34:03 +0800
Subject: [PATCH 18/24] Use VPTypeAnalysis, rework to collect users separately
 and then add phis

---
 .../Vectorize/VPlanConstruction.cpp           | 72 ++++++++++---------
 .../LoopVectorize/VPlan/tail-folding.ll       | 12 ++--
 2 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index e17de0bc35dc2..640232adcfe52 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1002,54 +1002,58 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   VPValue *HeaderMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
   Builder.createNaryOp(VPInstruction::BranchOnCond, HeaderMask);
 
-  VPBasicBlock *Latch = LoopRegion->getExitingBasicBlock();
+  VPBasicBlock *OrigLatch = LoopRegion->getExitingBasicBlock();
   VPValue *IVInc;
   [[maybe_unused]] bool TermBranchOnCount =
-      match(Latch->getTerminator(),
+      match(OrigLatch->getTerminator(),
             m_BranchOnCount(m_VPValue(IVInc),
                             m_Specific(&Plan.getVectorTripCount())));
   assert(TermBranchOnCount &&
          match(IVInc, m_Add(m_Specific(LoopRegion->getCanonicalIV()),
                             m_Specific(&Plan.getVFxUF()))) &&
          std::next(IVInc->getDefiningRecipe()->getIterator()) ==
-             Latch->getTerminator()->getIterator() &&
+             OrigLatch->getTerminator()->getIterator() &&
          "Unexpected canonical iv increment");
 
   // Split the latch at the IV update, and branch to it from the header mask.
-  VPBasicBlock *LatchSplit =
-      Latch->splitAt(IVInc->getDefiningRecipe()->getIterator());
-  VPBlockUtils::connectBlocks(Header, LatchSplit);
-
-  // Insert phis for any values in the predicated body used outside. Currently,
-  // this consists of header phis and extracts in the middle block.
-  // TODO: Handle all successors, not just the middle block when supporting
-  // early exits.
+  VPBasicBlock *Latch =
+      OrigLatch->splitAt(IVInc->getDefiningRecipe()->getIterator());
+  Latch->setName("latch");
+  VPBlockUtils::connectBlocks(Header, Latch);
+
+  // Collect any values defined in the loop that need a phi. Currently this is
+  // header phi backedges and live outs extracted in the middle block.
+  // TODO: Handle early exits via Plan.getExitBlocks()
   assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
          "The vector loop region must have the middle block as its single "
          "successor for now");
-  Builder.setInsertPoint(LatchSplit, LatchSplit->begin());
-  for (VPBasicBlock *VPBB : {Header, Plan.getMiddleBlock()}) {
-    for (VPRecipeBase &R : *VPBB) {
-      for (VPValue *V : R.operands()) {
-        VPRecipeBase *VR = V->getDefiningRecipe();
-        if (!VR || !VR->getRegion() || VR->getParent() == LatchSplit ||
-            VR->getParent() == Header)
-          continue;
-        assert((isa<VPHeaderPHIRecipe>(R) ||
-                match(&R, m_CombineOr(
-                              m_VPInstruction<VPInstruction::ExitingIVValue>(),
-                              m_ExtractLastPart(m_Specific(V))))) &&
-               "Unexpected user of value defined inside vector loop region");
-        // TODO: For reduction phis, use phi value instead of poison so we can
-        // remove the special casing for tail folding in
-        // LoopVectorizationPlanner::addReductionResultComputation
-        VPValue *Poison = Plan.getOrAddLiveIn(
-            PoisonValue::get(V->getUnderlyingValue()->getType()));
-        VPInstruction *Phi = Builder.createScalarPhi({V, Poison}, {});
-        V->replaceUsesWithIf(Phi,
-                             [&Phi](VPUser &U, unsigned) { return &U != Phi; });
-      }
-    }
+  SmallSetVector<VPValue *, 4> NeedsPhi;
+  for (VPRecipeBase &R : Header->phis())
+    if (auto *Phi = dyn_cast<VPHeaderPHIRecipe>(&R))
+      if (!isa<VPCanonicalIVPHIRecipe>(Phi) && Phi->getNumIncoming() > 1)
+        NeedsPhi.insert(Phi->getBackedgeValue());
+
+  VPValue *V;
+  for (VPRecipeBase &R : *Plan.getMiddleBlock())
+    if (match(&R, m_CombineOr(m_VPInstruction<VPInstruction::ExitingIVValue>(
+                                  m_VPValue(V)),
+                              m_ExtractLastLaneOfLastPart(m_VPValue(V)))))
+      NeedsPhi.insert(V);
+
+  // Insert phis with a poison incoming value for past the end of the tail.
+  Builder.setInsertPoint(Latch, Latch->begin());
+  VPTypeAnalysis TypeInfo(Plan);
+  for (VPValue *V : NeedsPhi) {
+    if (isa<VPIRValue>(V))
+      continue;
+    // TODO: For reduction phis, use phi value instead of poison so we can
+    // remove the special casing for tail folding in
+    // LoopVectorizationPlanner::addReductionResultComputation
+    VPValue *Poison =
+        Plan.getOrAddLiveIn(PoisonValue::get(TypeInfo.inferScalarType(V)));
+    VPInstruction *Phi = Builder.createScalarPhi({V, Poison});
+    V->replaceUsesWithIf(Phi,
+                         [&Phi](VPUser &U, unsigned) { return &U != Phi; });
   }
 
   // Any extract of the last element must be updated to extract from the last
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
index 711bcd6376a53..c0d7183fc4eb6 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
@@ -23,7 +23,7 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
 ; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
 ; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
-; CHECK-NEXT:    Successor(s): vector.body.split, vector.body.split.split
+; CHECK-NEXT:    Successor(s): vector.body.split, latch
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
@@ -32,9 +32,9 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT store vp<[[VP8:%[0-9]+]]>, ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:    Successor(s): vector.body.split.split
+; CHECK-NEXT:    Successor(s): latch
 ; CHECK-EMPTY:
-; CHECK-NEXT:    vector.body.split.split:
+; CHECK-NEXT:    latch:
 ; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%y>, vector.body.split ], [ ir<poison>, vector.body ]
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
@@ -109,7 +109,7 @@ define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
 ; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
 ; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
-; CHECK-NEXT:    Successor(s): vector.body.split, vector.body.split.split
+; CHECK-NEXT:    Successor(s): vector.body.split, latch
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
@@ -117,9 +117,9 @@ define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT ir<%rdx.next> = add ir<%rdx>, ir<%x>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:    Successor(s): vector.body.split.split
+; CHECK-NEXT:    Successor(s): latch
 ; CHECK-EMPTY:
-; CHECK-NEXT:    vector.body.split.split:
+; CHECK-NEXT:    latch:
 ; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%rdx.next>, vector.body.split ], [ ir<poison>, vector.body ]
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>

>From ae85948d363758041e74dfefe75a8d903dcf595e Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 18 Feb 2026 00:44:05 +0800
Subject: [PATCH 19/24] Add test cases for header with only phis, and live-out
 defined by phi

---
 .../LoopVectorize/VPlan/tail-folding.ll       | 165 ++++++++++++++++++
 1 file changed, 165 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
index c0d7183fc4eb6..f5c46fd3563bb 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
@@ -86,6 +86,171 @@ exit:
   ret i32 %y
 }
 
+define i32 @conditional_live_out(ptr noalias %p, i32 %n, i1 %c) {
+; CHECK-LABEL: VPlan for loop in 'conditional_live_out'
+; CHECK:  VPlan ' for UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count
+; CHECK-NEXT:  Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]>
+; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
+; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
+; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
+; CHECK-NEXT:    Successor(s): vector.body.split, latch
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split:
+; CHECK-NEXT:      EMIT branch-on-cond ir<%c>
+; CHECK-NEXT:    Successor(s): if, latch
+; CHECK-EMPTY:
+; CHECK-NEXT:    if:
+; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
+; CHECK-NEXT:      EMIT ir<%x> = load ir<%gep>
+; CHECK-NEXT:      EMIT ir<%y> = add ir<%x>, ir<1>
+; CHECK-NEXT:      EMIT store ir<%y>, ir<%gep>
+; CHECK-NEXT:    Successor(s): latch
+; CHECK-EMPTY:
+; CHECK-NEXT:    latch:
+; CHECK-NEXT:      EMIT-SCALAR ir<%phi> = phi [ ir<%y>, if ], [ ir<0>, vector.body.split ]
+; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
+; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
+; CHECK-NEXT:    Successor(s): latch
+; CHECK-EMPTY:
+; CHECK-NEXT:    latch:
+; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%phi>, latch ], [ ir<poison>, vector.body ]
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<[[VP10:%[0-9]+]]> = extract-last-part vp<[[VP8]]>
+; CHECK-NEXT:    EMIT vp<[[VP11:%[0-9]+]]> = extract-last-lane vp<[[VP10]]>
+; CHECK-NEXT:    EMIT vp<[[VP12:%[0-9]+]]> = last-active-lane vp<[[VP6]]>
+; CHECK-NEXT:    EMIT vp<[[VP13:%[0-9]+]]> = extract-lane vp<[[VP12]]>, vp<[[VP8]]>
+; CHECK-NEXT:    EMIT branch-on-cond ir<true>
+; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<exit>:
+; CHECK-NEXT:    IR   %phi.lcssa = phi i32 [ %phi, %latch ] (extra operand: vp<[[VP13]]> from middle.block)
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP15:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:  Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<loop>:
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] (extra operand: vp<[[VP15]]> from scalar.ph)
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %latch]
+  br i1 %c, label %if, label %latch
+
+if:
+  %gep = getelementptr i32, ptr %p, i32 %iv
+  %x = load i32, ptr %gep
+  %y = add i32 %x, 1
+  store i32 %y, ptr %gep
+  br label %latch
+
+latch:
+  %phi = phi i32 [0, %loop], [%y, %if]
+  %iv.next = add i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %phi
+}
+
+define void @header_unconditional_branch(ptr noalias %p, i32 %n) {
+; CHECK-LABEL: VPlan for loop in 'header_unconditional_branch'
+; CHECK:  VPlan ' for UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in vp<[[VP3:%[0-9]+]]> = backedge-taken count
+; CHECK-NEXT:  Live-in ir<%n> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0]]>
+; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
+; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
+; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
+; CHECK-NEXT:    Successor(s): vector.body.split, latch
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body.split:
+; CHECK-NEXT:    Successor(s): latch
+; CHECK-EMPTY:
+; CHECK-NEXT:    latch:
+; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
+; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
+; CHECK-NEXT:    Successor(s): latch
+; CHECK-EMPTY:
+; CHECK-NEXT:    latch:
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT branch-on-cond ir<true>
+; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<exit>:
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT-SCALAR vp<[[VP10:%[0-9]+]]> = phi [ ir<%iv>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:  Successor(s): ir-bb<loop>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<loop>:
+; CHECK-NEXT:    IR   %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] (extra operand: vp<[[VP10]]> from scalar.ph)
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %latch]
+  br label %latch
+
+latch:
+  %iv.next = add i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
 define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-LABEL: VPlan for loop in 'reduction'
 ; CHECK:  VPlan ' for UF>=1' {

>From 23a601060f068d4ce1ea218a5e50e3730b71b968 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 18 Feb 2026 00:52:38 +0800
Subject: [PATCH 20/24] Explicitly check for VPWidenInductionRecipe

---
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 640232adcfe52..4b08ba64180de 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1030,7 +1030,7 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   SmallSetVector<VPValue *, 4> NeedsPhi;
   for (VPRecipeBase &R : Header->phis())
     if (auto *Phi = dyn_cast<VPHeaderPHIRecipe>(&R))
-      if (!isa<VPCanonicalIVPHIRecipe>(Phi) && Phi->getNumIncoming() > 1)
+      if (!isa<VPCanonicalIVPHIRecipe, VPWidenInductionRecipe>(Phi))
         NeedsPhi.insert(Phi->getBackedgeValue());
 
   VPValue *V;

>From 5e1e58515d70c60deacf13d387aa7802387fccdb Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 18 Feb 2026 01:53:41 +0800
Subject: [PATCH 21/24] Fix replacing users inside the loop

This silently happened to work because these phis are simply replaced with the same original value.
---
 .../lib/Transforms/Vectorize/VPlanConstruction.cpp | 14 +++++++-------
 .../Transforms/LoopVectorize/VPlan/tail-folding.ll |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 4b08ba64180de..a3791a7bd36ce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1027,23 +1027,23 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
          "The vector loop region must have the middle block as its single "
          "successor for now");
-  SmallSetVector<VPValue *, 4> NeedsPhi;
+  DenseMap<VPValue *, SmallVector<VPUser *>> NeedsPhi;
   for (VPRecipeBase &R : Header->phis())
     if (auto *Phi = dyn_cast<VPHeaderPHIRecipe>(&R))
       if (!isa<VPCanonicalIVPHIRecipe, VPWidenInductionRecipe>(Phi))
-        NeedsPhi.insert(Phi->getBackedgeValue());
+        NeedsPhi[Phi->getBackedgeValue()].push_back(&R);
 
   VPValue *V;
   for (VPRecipeBase &R : *Plan.getMiddleBlock())
     if (match(&R, m_CombineOr(m_VPInstruction<VPInstruction::ExitingIVValue>(
                                   m_VPValue(V)),
-                              m_ExtractLastLaneOfLastPart(m_VPValue(V)))))
-      NeedsPhi.insert(V);
+                              m_ExtractLastPart(m_VPValue(V)))))
+      NeedsPhi[V].push_back(&R);
 
   // Insert phis with a poison incoming value for past the end of the tail.
   Builder.setInsertPoint(Latch, Latch->begin());
   VPTypeAnalysis TypeInfo(Plan);
-  for (VPValue *V : NeedsPhi) {
+  for (auto [V, Users] : NeedsPhi) {
     if (isa<VPIRValue>(V))
       continue;
     // TODO: For reduction phis, use phi value instead of poison so we can
@@ -1052,8 +1052,8 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
     VPValue *Poison =
         Plan.getOrAddLiveIn(PoisonValue::get(TypeInfo.inferScalarType(V)));
     VPInstruction *Phi = Builder.createScalarPhi({V, Poison});
-    V->replaceUsesWithIf(Phi,
-                         [&Phi](VPUser &U, unsigned) { return &U != Phi; });
+    for (VPUser *U : Users)
+      U->replaceUsesOfWith(V, Phi);
   }
 
   // Any extract of the last element must be updated to extract from the last
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
index f5c46fd3563bb..50a1d69236420 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
@@ -29,13 +29,13 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
 ; CHECK-NEXT:      EMIT ir<%x> = load ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%y> = add ir<%x>, ir<1>
-; CHECK-NEXT:      EMIT store vp<[[VP8:%[0-9]+]]>, ir<%gep>
+; CHECK-NEXT:      EMIT store ir<%y>, ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
 ; CHECK-NEXT:    Successor(s): latch
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    latch:
-; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%y>, vector.body.split ], [ ir<poison>, vector.body ]
+; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%y>, vector.body.split ], [ ir<poison>, vector.body ]
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
 ; CHECK-NEXT:    No successors

>From 4df3ab1120da88beef0b51e8a8b4cf2b83a84e05 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 20 Feb 2026 13:17:56 +0800
Subject: [PATCH 22/24] Use MapVector and cast VPHeaderPHIRecipe

---
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index a3791a7bd36ce..1b86ecaf876b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1027,11 +1027,10 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
          "The vector loop region must have the middle block as its single "
          "successor for now");
-  DenseMap<VPValue *, SmallVector<VPUser *>> NeedsPhi;
+  MapVector<VPValue *, SmallVector<VPUser *>> NeedsPhi;
   for (VPRecipeBase &R : Header->phis())
-    if (auto *Phi = dyn_cast<VPHeaderPHIRecipe>(&R))
-      if (!isa<VPCanonicalIVPHIRecipe, VPWidenInductionRecipe>(Phi))
-        NeedsPhi[Phi->getBackedgeValue()].push_back(&R);
+    if (!isa<VPCanonicalIVPHIRecipe, VPWidenInductionRecipe>(R))
+      NeedsPhi[cast<VPHeaderPHIRecipe>(R).getBackedgeValue()].push_back(&R);
 
   VPValue *V;
   for (VPRecipeBase &R : *Plan.getMiddleBlock())

>From 355bbb3231d69a51a9367ef8d7e7bfab382126bd Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 2 Mar 2026 17:37:33 +0800
Subject: [PATCH 23/24] Address review comments

---
 .../Vectorize/VPlanConstruction.cpp           | 14 +++++------
 .../Transforms/Vectorize/VPlanTransforms.h    |  2 +-
 .../LoopVectorize/VPlan/tail-folding.ll       | 24 +++++++++----------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 62b5e244ec236..409281ac1b94e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -999,6 +999,9 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
          "the exit block must have middle block as single predecessor");
 
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+         "The vector loop region must have the middle block as its single "
+         "successor for now");
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
   Header->splitAt(Header->getFirstNonPhi());
@@ -1028,15 +1031,12 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   // Split the latch at the IV update, and branch to it from the header mask.
   VPBasicBlock *Latch =
       OrigLatch->splitAt(IVInc->getDefiningRecipe()->getIterator());
-  Latch->setName("latch");
+  Latch->setName("vector.latch");
   VPBlockUtils::connectBlocks(Header, Latch);
 
-  // Collect any values defined in the loop that need a phi. Currently this is
-  // header phi backedges and live outs extracted in the middle block.
+  // Collect any values defined in the loop that need a phi. Currently this
+  // includes header phi backedges and live-outs extracted in the middle block.
   // TODO: Handle early exits via Plan.getExitBlocks()
-  assert(LoopRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
-         "The vector loop region must have the middle block as its single "
-         "successor for now");
   MapVector<VPValue *, SmallVector<VPUser *>> NeedsPhi;
   for (VPRecipeBase &R : Header->phis())
     if (!isa<VPCanonicalIVPHIRecipe, VPWidenInductionRecipe>(R))
@@ -1052,7 +1052,7 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   // Insert phis with a poison incoming value for past the end of the tail.
   Builder.setInsertPoint(Latch, Latch->begin());
   VPTypeAnalysis TypeInfo(Plan);
-  for (auto [V, Users] : NeedsPhi) {
+  for (const auto &[V, Users] : NeedsPhi) {
     if (isa<VPIRValue>(V))
       continue;
     // TODO: For reduction phis, use phi value instead of poison so we can
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3120f55fe1338..15fde7cebc710 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -450,7 +450,7 @@ struct VPlanTransforms {
   narrowInterleaveGroups(VPlan &Plan, const TargetTransformInfo &TTI);
 
   /// Adapts the vector loop region for tail folding by introducing a header
-  /// mask and predicating the region:
+  /// mask and conditionally executing the content of the region:
   ///
   /// Vector loop region before:
   /// +-------------------------------------------+
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
index 120f5775af015..6f80a678f3a50 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll
@@ -23,7 +23,7 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
 ; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
 ; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
-; CHECK-NEXT:    Successor(s): vector.body.split, latch
+; CHECK-NEXT:    Successor(s): vector.body.split, vector.latch
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
@@ -32,9 +32,9 @@ define i32 @live_out(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT store ir<%y>, ir<%gep>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:    Successor(s): latch
+; CHECK-NEXT:    Successor(s): vector.latch
 ; CHECK-EMPTY:
-; CHECK-NEXT:    latch:
+; CHECK-NEXT:    vector.latch:
 ; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%y>, vector.body.split ], [ ir<poison>, vector.body ]
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
@@ -108,7 +108,7 @@ define i32 @conditional_live_out(ptr noalias %p, i32 %n, i1 %c) {
 ; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
 ; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
 ; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
-; CHECK-NEXT:    Successor(s): vector.body.split, latch
+; CHECK-NEXT:    Successor(s): vector.body.split, vector.latch
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:      EMIT branch-on-cond ir<%c>
@@ -125,9 +125,9 @@ define i32 @conditional_live_out(ptr noalias %p, i32 %n, i1 %c) {
 ; CHECK-NEXT:      EMIT-SCALAR ir<%phi> = phi [ ir<%y>, if ], [ ir<0>, vector.body.split ]
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:    Successor(s): latch
+; CHECK-NEXT:    Successor(s): vector.latch
 ; CHECK-EMPTY:
-; CHECK-NEXT:    latch:
+; CHECK-NEXT:    vector.latch:
 ; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8:%[0-9]+]]> = phi [ ir<%phi>, latch ], [ ir<poison>, vector.body ]
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
@@ -202,7 +202,7 @@ define void @header_unconditional_branch(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
 ; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
 ; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
-; CHECK-NEXT:    Successor(s): vector.body.split, latch
+; CHECK-NEXT:    Successor(s): vector.body.split, vector.latch
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:    Successor(s): latch
@@ -210,9 +210,9 @@ define void @header_unconditional_branch(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:    latch:
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:    Successor(s): latch
+; CHECK-NEXT:    Successor(s): vector.latch
 ; CHECK-EMPTY:
-; CHECK-NEXT:    latch:
+; CHECK-NEXT:    vector.latch:
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
 ; CHECK-NEXT:    No successors
@@ -274,7 +274,7 @@ define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
 ; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = icmp ule vp<[[VP5]]>, vp<[[VP3]]>
 ; CHECK-NEXT:      EMIT branch-on-cond vp<[[VP6]]>
-; CHECK-NEXT:    Successor(s): vector.body.split, latch
+; CHECK-NEXT:    Successor(s): vector.body.split, vector.latch
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    vector.body.split:
 ; CHECK-NEXT:      EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
@@ -282,9 +282,9 @@ define i32 @reduction(ptr noalias %p, i32 %n) {
 ; CHECK-NEXT:      EMIT ir<%rdx.next> = add ir<%rdx>, ir<%x>
 ; CHECK-NEXT:      EMIT ir<%iv.next> = add ir<%iv>, ir<1>
 ; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<%n>
-; CHECK-NEXT:    Successor(s): latch
+; CHECK-NEXT:    Successor(s): vector.latch
 ; CHECK-EMPTY:
-; CHECK-NEXT:    latch:
+; CHECK-NEXT:    vector.latch:
 ; CHECK-NEXT:      EMIT-SCALAR vp<[[VP8]]> = phi [ ir<%rdx.next>, vector.body.split ], [ ir<poison>, vector.body ]
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP4]]>, vp<[[VP1]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>

>From a89f7dc462163b22f22e750f37748f1e36202665 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 2 Mar 2026 23:47:59 +0800
Subject: [PATCH 24/24] Remove other ExitingIVValue match, use
 m_ExtractLastLaneOfLastPart

---
 llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 409281ac1b94e..d47990ef5d1ef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1044,9 +1044,7 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
 
   VPValue *V;
   for (VPRecipeBase &R : *Plan.getMiddleBlock())
-    if (match(&R, m_CombineOr(m_VPInstruction<VPInstruction::ExitingIVValue>(
-                                  m_VPValue(V)),
-                              m_ExtractLastPart(m_VPValue(V)))))
+    if (match(&R, m_ExtractLastPart(m_VPValue(V))))
       NeedsPhi[V].push_back(&R);
 
   // Insert phis with a poison incoming value for past the end of the tail.
@@ -1071,7 +1069,7 @@ void VPlanTransforms::foldTailByMasking(VPlan &Plan) {
   Builder.setInsertPoint(Plan.getMiddleBlock()->getTerminator());
   for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
     VPValue *Op;
-    if (!match(&R, m_ExtractLastLane(m_ExtractLastPart(m_VPValue(Op)))))
+    if (!match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op))))
       continue;
 
     // Compute the index of the last active lane.