[llvm] This is a draft for enabling opt-in tail-folding on vectorized epilogue. (PR #181401)

Hassnaa Hamdi via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 13 10:59:49 PST 2026


https://github.com/hassnaaHamdi created https://github.com/llvm/llvm-project/pull/181401

Enable tail-folding on the vectorized epilogue, so that we can have:
a vectorized main loop, followed by
a tail-folded vectorized epilogue loop.

>From cafcaf890b7dcd3c2073eab826ab713bde5de1b2 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Tue, 27 Jan 2026 11:40:08 +0000
Subject: [PATCH] Epilogue tail folding draft

---
 .../Vectorize/LoopVectorizationLegality.h     |  20 +-
 llvm/lib/Transforms/Utils/BasicBlockUtils.cpp |   2 +-
 .../Vectorize/LoopVectorizationLegality.cpp   |   2 +-
 .../Vectorize/LoopVectorizationPlanner.h      |  32 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 457 ++++++++++++++----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  18 +-
 .../Vectorize/VPlanConstruction.cpp           |  11 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   3 +
 .../Transforms/Vectorize/VPlanTransforms.h    |   3 +-
 .../AArch64/sve-epilog-tail-folded-vect.ll    | 393 +++++++++++++++
 10 files changed, 826 insertions(+), 115 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-tail-folded-vect.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index f82fc588639dd..8b39f4d3a2bdd 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -430,9 +430,12 @@ class LoopVectorizationLegality {
     return LAI->getDepChecker().getStoreLoadForwardSafeDistanceInBits();
   }
 
-  /// Returns true if vector representation of the instruction \p I
-  /// requires mask.
-  bool isMaskRequired(const Instruction *I) const {
+  /// Returns true if instruction \p I requires a mask for vectorization.
+  /// This accounts for both control flow masking (conditionally executed
+  /// blocks) and tail-folding masking (predicated loop vectorization).
+  bool isMaskRequired(const Instruction *I, bool TailFolded) const {
+    if (TailFolded)
+      return TailFoldedMaskedOp.contains(I);
     return MaskedOp.contains(I);
   }
 
@@ -709,9 +712,16 @@ class LoopVectorizationLegality {
   AssumptionCache *AC;
 
   /// While vectorizing these instructions we have to generate a
-  /// call to the appropriate masked intrinsic or drop them in case of
-  /// conditional assumes.
+  /// call to the appropriate masked intrinsic or drop them.
+  /// In order to differentiate between control flow introduced at the source
+  /// level and that introduced by the loop vectorizer during tail-folding, we
+  /// keep two lists:
+  /// 1) MaskedOp - instructions that need masking if we are in a
+  ///    conditionally executed block.
+  /// 2) TailFoldedMaskedOp - instructions that need masking because of tail-
+  ///    folding.
   SmallPtrSet<const Instruction *, 8> MaskedOp;
+  SmallPtrSet<const Instruction *, 8> TailFoldedMaskedOp;
 
   /// Contains all identified histogram operations, which are sequences of
   /// load -> update -> store instructions where multiple lanes in a vector
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 6472e1771ec73..b0eb4dc961d28 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -152,7 +152,7 @@ void llvm::DeleteDeadBlocks(ArrayRef <BasicBlock *> BBs, DomTreeUpdater *DTU,
 #ifndef NDEBUG
   // Make sure that all predecessors of each dead block is also dead.
   SmallPtrSet<BasicBlock *, 4> Dead(llvm::from_range, BBs);
-  assert(Dead.size() == BBs.size() && "Duplicating blocks?");
+// assert(Dead.size() == BBs.size() && "Duplicating blocks?");
   for (auto *BB : Dead)
     for (BasicBlock *Pred : predecessors(BB))
       assert(Dead.count(Pred) && "All predecessors must be dead!");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e57e0cf636501..835723362882e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -2156,7 +2156,7 @@ void LoopVectorizationLegality::prepareToFoldTailByMasking() {
   // Mark all blocks for predication, including those that ordinarily do not
   // need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
+    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, TailFoldedMaskedOp);
     assert(R && "Must be able to predicate block when tail-folding.");
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 54bb073eb4f81..39f8ffbb2a053 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -496,6 +496,7 @@ class LoopVectorizationPlanner {
 
   /// The profitability analysis.
   LoopVectorizationCostModel &CM;
+  LoopVectorizationCostModel *EpilogueTailFoldedCM;
 
   /// The interleaved access analysis.
   InterleavedAccessInfo &IAI;
@@ -507,6 +508,7 @@ class LoopVectorizationPlanner {
   OptimizationRemarkEmitter *ORE;
 
   SmallVector<VPlanPtr, 4> VPlans;
+  SmallVector<VPlanPtr, 4> EpilogueTailFoldedPlans;
 
   /// Profitable vector factors.
   SmallVector<VectorizationFactor, 8> ProfitableVFs;
@@ -538,7 +540,21 @@ class LoopVectorizationPlanner {
       PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints,
       OptimizationRemarkEmitter *ORE)
       : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
-        IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
+        EpilogueTailFoldedCM(nullptr), IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
+
+  void setEpilogueTailFoldingCM(LoopVectorizationCostModel *Cost) {
+    EpilogueTailFoldedCM = Cost;
+  }
+
+  LoopVectorizationCostModel *getEpilogueTailFoldingCM() const {
+    return EpilogueTailFoldedCM;
+  }
+
+  bool isEpilogueTailFolded() const;
+
+  void disableEpilogueTailFolding() {
+    EpilogueTailFoldedCM = nullptr;
+  }
 
   /// Build VPlans for the specified \p UserVF and \p UserIC if they are
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
@@ -551,7 +567,7 @@ class LoopVectorizationPlanner {
 
   /// Return the VPlan for \p VF. At the moment, there is always a single VPlan
   /// for each VF.
-  VPlan &getPlanFor(ElementCount VF) const;
+  VPlan &getPlanFor(ElementCount VF, bool ForEpilogue = false) const;
 
   /// Compute and return the most profitable vectorization factor. Also collect
   /// all profitable VFs in ProfitableVFs.
@@ -586,9 +602,9 @@ class LoopVectorizationPlanner {
 
   /// Look through the existing plans and return true if we have one with
   /// vectorization factor \p VF.
-  bool hasPlanWithVF(ElementCount VF) const {
-    return any_of(VPlans,
-                  [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
+  bool hasPlanWithVF(ElementCount VF, bool ForEpilogue = false) const {
+    return any_of((ForEpilogue && isEpilogueTailFolded()) ? EpilogueTailFoldedPlans : VPlans,
+                   [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
   }
 
   /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
@@ -648,7 +664,8 @@ class LoopVectorizationPlanner {
   /// set the largest included VF to the maximum VF for which no plan could be
   /// built. Each VPlan is built starting from a copy of \p InitialPlan, which
   /// is a plain CFG VPlan wrapping the original scalar loop.
-  VPlanPtr tryToBuildVPlanWithVPRecipes(VPlanPtr InitialPlan, VFRange &Range,
+  VPlanPtr tryToBuildVPlanWithVPRecipes(LoopVectorizationCostModel *Cost,
+                                        VPlanPtr InitialPlan, VFRange &Range,
                                         LoopVersioning *LVer);
 
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
@@ -660,7 +677,8 @@ class LoopVectorizationPlanner {
   /// ComputeReductionResult depending on the reduction) in
   /// the middle block. Selects are introduced for reductions between the phi
   /// and users outside the vector region when folding the tail.
-  void addReductionResultComputation(VPlanPtr &Plan,
+  void addReductionResultComputation(LoopVectorizationCostModel *Cost,
+                                     VPlanPtr &Plan,
                                      VPRecipeBuilder &RecipeBuilder,
                                      ElementCount MinVF);
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 499c5a31421ed..6755f8a818812 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -207,7 +207,8 @@ namespace PreferPredicateTy {
   enum Option {
     ScalarEpilogue = 0,
     PredicateElseScalarEpilogue,
-    PredicateOrDontVectorize
+    PredicateOrDontVectorize,
+    PredicatedEpilogue
   };
 } // namespace PreferPredicateTy
 
@@ -227,7 +228,11 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
-                         "tail-folding fails.")));
+                         "tail-folding fails."),
+              clEnumValN(PreferPredicateTy::PredicatedEpilogue,
+                         "predicated-epilogue",
+                         "prefers predicated vector epilogues, falling back on "
+                         "scalar epilogues if it fails.")));
 
 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
     "force-tail-folding-style", cl::desc("Force the tail folding style"),
@@ -642,10 +647,12 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
       DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
       EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
       GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
-      ElementCount MinProfitableTripCount, unsigned UnrollFactor)
+      ElementCount MinProfitableTripCount, unsigned UnrollFactor,
+      bool isEpilogueTailFolded)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
                             UnrollFactor, CM, Checks, Plan),
-        EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
+        EPI(EPI), MinProfitableTripCount(MinProfitableTripCount),
+        isEpilogueTailFolded(isEpilogueTailFolded) {}
 
   /// Holds and updates state information required to vectorize the main loop
   /// and its epilogue in two separate passes. This setup helps us avoid
@@ -657,6 +664,7 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
 
 protected:
   ElementCount MinProfitableTripCount;
+  bool isEpilogueTailFolded;
 };
 
 /// A specialized derived class of inner loop vectorizer that performs
@@ -670,10 +678,12 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
                              AssumptionCache *AC,
                              EpilogueLoopVectorizationInfo &EPI,
                              LoopVectorizationCostModel *CM,
-                             GeneratedRTChecks &Check, VPlan &Plan)
+                             GeneratedRTChecks &Check, VPlan &Plan,
+                             bool isEpilogueTailFolded)
       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                        Check, Plan, EPI.MainLoopVF,
-                                       EPI.MainLoopVF, EPI.MainLoopUF) {}
+                                       EPI.MainLoopVF, EPI.MainLoopUF,
+                                        isEpilogueTailFolded) {}
   /// Implements the interface for creating a vectorized skeleton using the
   /// *main loop* strategy (i.e., the first pass of VPlan execution).
   BasicBlock *createVectorizedLoopSkeleton() final;
@@ -708,10 +718,12 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
                                  AssumptionCache *AC,
                                  EpilogueLoopVectorizationInfo &EPI,
                                  LoopVectorizationCostModel *CM,
-                                 GeneratedRTChecks &Checks, VPlan &Plan)
+                                 GeneratedRTChecks &Checks, VPlan &Plan,
+                                 bool isEpilogueTailFolded)
       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                        Checks, Plan, EPI.EpilogueVF,
-                                       EPI.EpilogueVF, EPI.EpilogueUF) {}
+                                       EPI.EpilogueVF, EPI.EpilogueUF,
+                                       isEpilogueTailFolded) {}
   /// Implements the interface for creating a vectorized skeleton using the
   /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
   BasicBlock *createVectorizedLoopSkeleton() final;
@@ -861,8 +873,8 @@ enum ScalarEpilogueLowering {
   CM_ScalarEpilogueNotAllowedLowTripLoop,
 
   // Loop hint predicate indicating an epilogue is undesired.
-  CM_ScalarEpilogueNotNeededUsePredicate,
-
+  CM_ScalarEpilogueNotNeededUsePredicatedBody,
+  CM_ScalarEpilogueNotNeededUsePredicatedEpilogue,
   // Directive indicating we must either tail fold or not vectorize
   CM_ScalarEpilogueNotAllowedUsePredicate
 };
@@ -966,6 +978,13 @@ class LoopVectorizationCostModel {
     return MinBWs;
   }
 
+  void copyMinimalBitwidths(const MapVector<Instruction *, uint64_t> &BWs) {
+    MinBWs.clear();
+    for (auto &BW : BWs) {
+      MinBWs.insert(BW);
+    }
+  }
+
   /// \returns True if it is more profitable to scalarize instruction \p I for
   /// vectorization factor \p VF.
   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
@@ -1234,6 +1253,10 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor that will be used to vectorize \p I.
   bool isScalarWithPredication(Instruction *I, ElementCount VF);
 
+  /// Wrapper function for LoopVectorizationLegality::isMaskRequired,
+  /// that passes the Instruction \p I and if we fold tail.
+  bool isMaskRequired(Instruction *I) const;
+
   /// Returns true if \p I is an instruction that needs to be predicated
   /// at runtime.  The result is independent of the predication mechanism.
   /// Superset of instructions that return true for isScalarWithPredication.
@@ -1312,12 +1335,16 @@ class LoopVectorizationCostModel {
   /// Returns true if a scalar epilogue is not allowed due to optsize or a
   /// loop hint annotation.
   bool isScalarEpilogueAllowed() const {
-    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+    // We may have requested the creation of a predicated vector epilogue, but
+    // the cost model may still decide it's not worth it and should fall back
+    // on a scalar epilogue.
+    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+           ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
   }
 
   /// Returns true if tail-folding is preferred over a scalar epilogue.
   bool preferPredicatedLoop() const {
-    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
+    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedBody ||
            ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
   }
 
@@ -1334,6 +1361,8 @@ class LoopVectorizationCostModel {
   /// \param IsScalableVF true if scalable vector factors enabled.
   /// \param UserIC User specific interleave count.
   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+    // TODO: Should probably have separate tail-folding styles for the main
+    // loop and the epilogue.
     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
     if (!Legal->canFoldTailByMasking()) {
       ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
@@ -1360,7 +1389,7 @@ class LoopVectorizationCostModel {
     // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
     // if it's allowed, or DataWithoutLaneMask otherwise.
     if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
-        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedBody)
       ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
     else
       ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
@@ -1375,6 +1404,7 @@ class LoopVectorizationCostModel {
   }
 
   /// Returns true if all loop blocks should be masked to fold tail loop.
+  /// TODO: Distinguish between the predicated-body and predicated-epilogue cases.
   bool foldTailByMasking() const {
     // TODO: check if it is possible to check for None style independent of
     // IVUpdateMayOverflow flag in getTailFoldingStyle.
@@ -1405,6 +1435,7 @@ class LoopVectorizationCostModel {
   /// for any reason, e.g. because tail folding now requires a predicate
   /// or because the block in the original loop was predicated.
   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
+    // TODO: Distinguish between the predicated-body and predicated-epilogue cases.
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
@@ -2381,6 +2412,7 @@ Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
   } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
              !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
              Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
+    // TODO: Is the overflow check also needed for the epilogue loop?
     // vscale is not necessarily a power-of-2, which means we cannot guarantee
     // an overflow to zero when updating induction variables and so an
     // additional overflow check is required before entering the vector loop.
@@ -2772,6 +2804,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
 
     // If tail-folding is applied, the primary induction variable will be used
     // to feed a vector compare.
+    // TODO: Distinguish between the predicated-body and predicated-epilogue cases.
     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
       continue;
 
@@ -2864,12 +2897,17 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
   }
 }
 
+bool LoopVectorizationCostModel::isMaskRequired(Instruction *I) const {
+  return Legal->isMaskRequired(I, foldTailByMasking());
+}
+
 // TODO: Fold into LoopVectorizationLegality::isMaskRequired.
 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
   // TODO: We can use the loop-preheader as context point here and get
   // context sensitive reasoning for isSafeToSpeculativelyExecute.
+  bool PredicatedLoop = foldTailByMasking();
   if (isSafeToSpeculativelyExecute(I) ||
-      (isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
+      (isa<LoadInst, StoreInst, CallInst>(I) && !isMaskRequired(I)) ||
       isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
     return false;
 
@@ -2879,7 +2917,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
     return true;
 
   // If we're not folding the tail by masking, predication is unnecessary.
-  if (!foldTailByMasking())
+  // TODO: Distinguish between the predicated-body and predicated-epilogue cases.
+  if (!PredicatedLoop)
     return false;
 
   // All that remain are instructions with side-effects originally executed in
@@ -2894,7 +2933,7 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
   case Instruction::Call:
     // Side-effects of a Call are assumed to be non-invariant, needing a
     // (fold-tail) mask.
-    assert(Legal->isMaskRequired(I) &&
+    assert(isMaskRequired(I) &&
            "should have returned earlier for calls not needing a mask");
     return true;
   case Instruction::Load:
@@ -3041,8 +3080,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   // (either a gap at the end of a load-access that may result in a speculative
   // load, or any gaps in a store-access).
   bool PredicatedAccessRequiresMasking =
-      blockNeedsPredicationForAnyReason(I->getParent()) &&
-      Legal->isMaskRequired(I);
+      blockNeedsPredicationForAnyReason(I->getParent()) && isMaskRequired(I);
   bool LoadAccessWithGapsRequiresEpilogMasking =
       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
       !isScalarEpilogueAllowed();
@@ -3454,6 +3492,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
   return MaxScalableVF;
 }
 
+// TODO: What does FoldTailByMasking mean here? Should it only be true
+// when predicating the main vector body or for the epilogue too?
 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
     unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
     bool FoldTailByMasking) {
@@ -3613,12 +3653,14 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false);
   case CM_ScalarEpilogueNotAllowedUsePredicate:
     [[fallthrough]];
-  case CM_ScalarEpilogueNotNeededUsePredicate:
+  case CM_ScalarEpilogueNotNeededUsePredicatedBody:
     LLVM_DEBUG(
         dbgs() << "LV: vector predicate hint/switch found.\n"
                << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
+  case CM_ScalarEpilogueNotNeededUsePredicatedEpilogue:
+    break;
   case CM_ScalarEpilogueNotAllowedLowTripLoop:
     // fallthrough as a special case of OptForSize
   case CM_ScalarEpilogueNotAllowedOptSize:
@@ -3648,9 +3690,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     // none were taken so far.
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
-
+  bool TailFoldedIntoMainBody = 
+      ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
   FixedScalableVFPair MaxFactors =
-      computeFeasibleMaxVF(MaxTC, UserVF, UserIC, true);
+      computeFeasibleMaxVF(MaxTC, UserVF, UserIC, TailFoldedIntoMainBody);
 
   // Avoid tail folding if the trip count is known to be a multiple of any VF
   // we choose.
@@ -3723,6 +3766,14 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return FixedScalableVFPair::getNone();
   }
 
+  // We don't want to set the tail-folding style on this cost model when we're
+  // only using predicated vector epilogues.
+  if (!TailFoldedIntoMainBody) {
+    LLVM_DEBUG(dbgs() << "LV: Using unpredicated vector body with predicated "
+                         "vector epilogue.\n");
+    return MaxFactors;
+  }
+
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
@@ -3730,6 +3781,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
   setTailFoldingStyles(ContainsScalableVF, UserIC);
   if (foldTailByMasking()) {
+    // TODO: Should probably disallow predicated vector epilogues with EVL.
     if (foldTailWithEVL()) {
       LLVM_DEBUG(
           dbgs()
@@ -3748,7 +3800,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
 
   // If there was a tail-folding hint/switch, but we can't fold the tail by
   // masking, fallback to a vectorization with a scalar epilogue.
-  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedBody ||
+      ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedEpilogue) {
     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                          "scalar epilogue instead.\n");
     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
@@ -3845,6 +3898,8 @@ ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
   return VF;
 }
 
+// TODO: FoldTailByMasking appears to be mostly useful for a single predicated
+// body; revisit its meaning for predicated epilogues.
 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
@@ -4418,11 +4473,16 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   // with vectorization factors larger than a certain value.
 
   // Allow the target to opt out entirely.
+  // TODO: Determine how this interacts with the option to create predicated
+  // vector epilogues.
   if (!TTI.preferEpilogueVectorization())
     return false;
 
   // We also consider epilogue vectorization unprofitable for targets that don't
   // consider interleaving beneficial (eg. MVE).
+
+  // TODO: This shouldn't be a restriction if the target prefers predicated
+  // vector epilogues.
   if (TTI.getMaxInterleaveFactor(VF) <= 1)
     return false;
 
@@ -4435,11 +4495,15 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     const ElementCount MainLoopVF, unsigned IC) {
   VectorizationFactor Result = VectorizationFactor::Disabled();
+  // TODO: Determine how this interacts with forcing predicated vector epilogues.
   if (!EnableEpilogueVectorization) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
     return Result;
   }
 
+  // We should probably avoid creating the predicated plans much earlier if a
+  // scalar epilogue isn't allowed.
+  // Deliberately using unpredicated cost model here - is this right?
   if (!CM.isScalarEpilogueAllowed()) {
     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                          "epilogue is allowed.\n");
@@ -4457,7 +4521,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
   if (EpilogueVectorizationForceVF > 1) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
-    if (hasPlanWithVF(ForcedEC))
+    if (hasPlanWithVF(ForcedEC, /*ForEpilogue*/ true))
       return {ForcedEC, 0, 0};
 
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
@@ -4471,17 +4535,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     return Result;
   }
 
-  if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
+  if (!EpilogueTailFoldedCM &&
+    !CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                          "this loop\n");
     return Result;
   }
-
   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
   // the main loop handles 8 lanes per iteration. We could still benefit from
   // vectorizing the epilogue loop with VF=4.
+
+  // FIXME: This doesn't consider interleaving, does it matter?
+  unsigned Multiplier = EpilogueTailFoldedCM ? IC : 1;
   ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
-      estimateElementCount(MainLoopVF, CM.getVScaleForTuning()));
+      estimateElementCount(MainLoopVF * Multiplier, CM.getVScaleForTuning()));
 
   Type *TCType = Legal->getWidestInductionType();
   const SCEV *RemainingIterations = nullptr;
@@ -4513,6 +4580,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     return Result;
 
   if (MainLoopVF.isFixed()) {
+    // TODO: extend to support scalable VFs.
     MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
     if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                             SE.getConstant(TCType, MaxTripCount))) {
@@ -4527,15 +4595,17 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
   };
   for (auto &NextVF : ProfitableVFs) {
     // Skip candidate VFs without a corresponding VPlan.
-    if (!hasPlanWithVF(NextVF.Width))
+    if (!hasPlanWithVF(NextVF.Width,
+                      /*ForEpilogue*/ true))
       continue;
 
     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
     // vectors) or > the VF of the main loop (fixed vectors).
+    // FIXME: Why are we using isKnownGE for scalable vectors?
     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
         (NextVF.Width.isScalable() &&
-         ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
+        ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
       continue;
@@ -4545,6 +4615,12 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     // TODO: We should also consider comparing against a scalable
     // RemainingIterations when SCEV be able to evaluate non-canonical
     // vscale-based expressions.
+
+    // TODO: For predicated vector epilogues it's ok for the epilogue VF
+    // width to be > remaining iterations. In fact, it can be a good
+    // thing if it allows us to remove the loop by eliminating the backedge
+    // branch. We just want to make sure we're using enough lanes in
+    // the vector to be worthwhile.
     if (!ScalableRemIter) {
       // Handle the case where NextVF and RemainingIterations are in different
       // numerical spaces.
@@ -4556,9 +4632,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
         continue;
     }
 
+    // TODO: We really need to add support for calculating a MaxTripCount
+    // as this will be useful for predicated vector epilogues.
+
+    // TODO: Need to update last flag passed to isMoreProfitable.
     if (Result.Width.isScalar() ||
-        isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking(),
-                         /*IsEpilogue*/ true))
+        isMoreProfitable(NextVF, Result, MaxTripCount,
+                        EpilogueTailFoldedCM ? !EpilogueTailFoldedCM->foldTailByMasking() :
+                        !CM.foldTailByMasking(), /*IsEpilogue*/ true))
       Result = NextVF;
   }
 
@@ -5323,7 +5404,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
          "Stride should be 1 or -1 for consecutive memory access");
   const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
-  if (Legal->isMaskRequired(I)) {
+  if (isMaskRequired(I)) {
     unsigned IID = I->getOpcode() == Instruction::Load
                        ? Intrinsic::masked_load
                        : Intrinsic::masked_store;
@@ -5392,8 +5473,8 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                      : Intrinsic::masked_scatter;
   return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
          TTI.getMemIntrinsicInstrCost(
-             MemIntrinsicCostAttributes(IID, VectorTy, Ptr,
-                                        Legal->isMaskRequired(I), Alignment, I),
+             MemIntrinsicCostAttributes(IID, VectorTy, Ptr, isMaskRequired(I),
+                                        Alignment, I),
              CostKind);
 }
 
@@ -5423,12 +5504,11 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
       (isa<StoreInst>(I) && !Group->isFull());
   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
       InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-      Group->getAlign(), AS, CostKind, Legal->isMaskRequired(I),
-      UseMaskForGaps);
+      Group->getAlign(), AS, CostKind, isMaskRequired(I), UseMaskForGaps);
 
   if (Group->isReverse()) {
     // TODO: Add support for reversed masked interleaved access.
-    assert(!Legal->isMaskRequired(I) &&
+    assert(!isMaskRequired(I) &&
            "Reverse masked interleaved access not supported.");
     Cost += Group->getNumMembers() *
             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
@@ -5724,6 +5804,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
           // stores.  Note that even with tail folding we know that at least
           // one lane is active (i.e. generalized predication is not possible
           // here), and the logic below depends on this fact.
+          // TODO: Needs to reason about predicated main body vs epilogue.
           if (!foldTailByMasking())
             return true;
 
@@ -5976,7 +6057,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
         continue;
       }
 
-      bool MaskRequired = Legal->isMaskRequired(CI);
+      bool MaskRequired = isMaskRequired(CI);
       // Compute corresponding vector type for return value and arguments.
       Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
       for (Type *ScalarTy : ScalarTys)
@@ -6796,11 +6877,17 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
+bool LoopVectorizationPlanner::isEpilogueTailFolded() const {
+  return EpilogueTailFoldedCM && EpilogueTailFoldedCM->foldTailByMasking();
+}
+
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
 
+  // TODO: computeMaxVF -> computeFeasibleMaxVF computes MinBWs via
+  // computeMinimumValueSizes; those results need copying to EpilogueTailFoldedCM.
   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
     return;
@@ -6819,12 +6906,48 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.invalidateCostModelingDecisions();
   }
 
-  if (CM.foldTailByMasking())
+  // TODO: Use information from computeMaxVF to know if we should still use a
+  // predicated epilogue.
+  if (EpilogueTailFoldedCM) {
+    LLVM_DEBUG(dbgs() << "LV: Preparing predicated epilogue cost model.\n");
+    EpilogueTailFoldedCM->collectValuesToIgnore();
+    EpilogueTailFoldedCM->collectElementTypesForWidening();
+    EpilogueTailFoldedCM->setTailFoldingStyles(MaxFactors.ScalableVF.isNonZero(), UserIC);
+    // If the max VF is likely to be 2, then there probably isn't much point
+    // generating a predicated vector epilogue.
+    unsigned EstimatedMaxVF = MaxFactors.FixedVF.getFixedValue();
+    if (MaxFactors.ScalableVF.isNonZero()) {
+      unsigned EstimatedMaxScalableVF = estimateElementCount(MaxFactors.ScalableVF, CM.getVScaleForTuning());
+      EstimatedMaxVF = std::max(EstimatedMaxVF, EstimatedMaxScalableVF);
+    }
+    if (EpilogueTailFoldedCM->foldTailByMasking() && !EpilogueTailFoldedCM->InterleaveInfo.hasGroups() && EstimatedMaxVF > 2) {
+      EpilogueTailFoldedCM->copyMinimalBitwidths(CM.getMinimalBitwidths());
+
+      // Invalidate interleave groups if all blocks of loop will be predicated.
+      if (EpilogueTailFoldedCM->blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
+          !useMaskedInterleavedAccesses(TTI)) {
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Invalidate all interleaved groups due to fold-tail by masking "
+               "which requires masked-interleaved support.\n");
+        if (EpilogueTailFoldedCM->InterleaveInfo.invalidateGroups())
+          // Invalidating interleave groups also requires invalidating all decisions
+          // based on them, which includes widening decisions and uniform and scalar
+          // values.
+          EpilogueTailFoldedCM->invalidateCostModelingDecisions();
+      }
+    } else
+      EpilogueTailFoldedCM = nullptr;
+  }
+
+  // TODO: Does this only apply for predicated main body?
+  if (CM.foldTailByMasking() || (EpilogueTailFoldedCM && EpilogueTailFoldedCM->foldTailByMasking()))
     Legal->prepareToFoldTailByMasking();
 
   ElementCount MaxUserVF =
       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
   if (UserVF) {
+    assert(!EpilogueTailFoldedCM);
     if (!ElementCount::isKnownLE(UserVF, MaxUserVF)) {
       reportVectorizationInfo(
           "UserVF ignored because it may be larger than the maximal safe VF",
@@ -6861,6 +6984,14 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     CM.collectNonVectorizedAndSetWideningDecisions(VF);
   }
 
+  if (EpilogueTailFoldedCM) {
+    EpilogueTailFoldedCM->collectInLoopReductions();
+    for (const auto &VF : VFCandidates) {
+      // Collect Uniform and Scalar instructions after vectorization with VF.
+      EpilogueTailFoldedCM->collectNonVectorizedAndSetWideningDecisions(VF);
+    }
+  }
+
   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
 
@@ -7290,6 +7421,56 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
     }
   }
 
+  if (EpilogueTailFoldedCM) {
+    // Find profitable VFs for vector epilogue.
+    ProfitableVFs.clear();
+
+    for (auto &P : EpilogueTailFoldedPlans) {
+      ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
+                                 P->vectorFactors().end());
+
+      SmallVector<VPRegisterUsage, 8> RUs;
+      if (EpilogueTailFoldedCM->useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
+          EpilogueTailFoldedCM->useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+        RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, EpilogueTailFoldedCM->ValuesToIgnore);
+
+      for (unsigned I = 0; I < VFs.size(); I++) {
+        ElementCount VF = VFs[I];
+        if (VF.isScalar())
+          continue;
+        if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
+          LLVM_DEBUG(
+              dbgs()
+              << "LV: Not considering vector loop of width " << VF
+              << " because it will not generate any vector instructions.\n");
+          continue;
+        }
+        if (EpilogueTailFoldedCM->OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
+          LLVM_DEBUG(
+              dbgs()
+              << "LV: Not considering vector loop of width " << VF
+              << " because it would cause replicated blocks to be generated,"
+              << " which isn't allowed when optimizing for size.\n");
+          continue;
+        }
+
+        InstructionCost Cost = cost(*P, VF);
+        VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+
+        if (EpilogueTailFoldedCM->shouldConsiderRegPressureForVF(VF) &&
+            RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
+          LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
+                            << VF << " because it uses too many registers\n");
+          continue;
+        }
+
+        // If profitable add it to ProfitableVF list.
+        if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
+          ProfitableVFs.push_back(CurrentFactor);
+      }
+    }
+  }
+
 #ifndef NDEBUG
   // Select the optimal vectorization factor according to the legacy cost-model.
   // This is now only used to verify the decisions by the new VPlan-based
@@ -7638,10 +7819,17 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
     VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
   }
 
-  BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
-  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
-    setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
-  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
+  if (ForEpilogue && isEpilogueTailFolded) {
+    BranchInst &BI =
+        *BranchInst::Create(Bypass, VectorPH, Builder.getFalse());
+    ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
+  } else {
+    BranchInst &BI =
+        *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
+    if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+      setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
+    ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
+  }
 
   // When vectorizing the main loop, its trip-count check is placed in a new
   // block, whereas the overall trip-count check is placed in the VPlan entry
@@ -7660,8 +7848,13 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
 /// entry block to the epilogue VPlan. The minimum iteration check is being
 /// represented in VPlan.
 BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
-  BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
-  BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
+  BasicBlock *OriginalScalarPH = nullptr;
+  if (isEpilogueTailFolded)
+    OriginalScalarPH = OrigLoop->getLoopPreheader();
+  else {
+    BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog.");
+    OriginalScalarPH = NewScalarPH->getSinglePredecessor();
+  }
   OriginalScalarPH->setName("vec.epilog.iter.check");
   VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
   VPBasicBlock *OldEntry = Plan.getEntry();
@@ -7719,7 +7912,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
 
   // If a mask is not required, drop it - use unmasked version for safe loads.
   // TODO: Determine if mask is needed in VPlan.
-  VPValue *Mask = Legal->isMaskRequired(I) ? VPI->getMask() : nullptr;
+  VPValue *Mask = CM.isMaskRequired(I) ? VPI->getMask() : nullptr;
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
@@ -7986,7 +8179,7 @@ VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
 
   // In case of predicated execution (due to tail-folding, or conditional
   // execution, or both), pass the relevant mask.
-  if (Legal->isMaskRequired(HI->Store))
+  if (CM.isMaskRequired(HI->Store))
     HGramOps.push_back(VPI->getMask());
 
   return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
@@ -8139,7 +8332,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
-    if (auto Plan = tryToBuildVPlanWithVPRecipes(
+    if (auto Plan = tryToBuildVPlanWithVPRecipes(&CM,
             std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
       // Now optimize the initial VPlan.
       VPlanTransforms::hoistPredicatedLoads(*Plan, PSE, OrigLoop);
@@ -8158,10 +8351,30 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     }
     VF = SubRange.End;
   }
+
+  if (!isEpilogueTailFolded())
+    return;
+  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
+    VFRange SubRange = {VF, MaxVFTimes2};
+    if (auto Plan = tryToBuildVPlanWithVPRecipes(
+            EpilogueTailFoldedCM, std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
+      bool HasScalarVF = Plan->hasScalarVFOnly();
+      // Now optimize the initial VPlan.
+      if (!HasScalarVF)
+        RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths,
+                                 *Plan, EpilogueTailFoldedCM->getMinimalBitwidths());
+      RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
+      assert(!EpilogueTailFoldedCM->foldTailWithEVL());
+      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
+      EpilogueTailFoldedPlans.push_back(std::move(Plan));
+    }
+    VF = SubRange.End;
+  }
 }
 
 VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
-    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
+    LoopVectorizationCostModel *Cost, VPlanPtr Plan, VFRange &Range,
+    LoopVersioning *LVer) {
 
   using namespace llvm::VPlanPatternMatch;
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
@@ -8173,14 +8386,16 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
 
   bool RequiresScalarEpilogueCheck =
       LoopVectorizationPlanner::getDecisionAndClampRange(
-          [this](ElementCount VF) {
-            return !CM.requiresScalarEpilogue(VF.isVector());
+          [Cost](ElementCount VF) {
+            return !Cost->requiresScalarEpilogue(VF.isVector());
           },
           Range);
+
+  // TODO: foldTailByMasking needs to return different answers depending upon
+  // whether it's for the main body or the vector epilogue.
   VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
   VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
-                                  CM.foldTailByMasking());
-
+                                  Cost->foldTailByMasking());
   VPlanTransforms::createLoopRegions(*Plan);
 
   // Don't use getDecisionAndClampRange here, because we don't know the UF
@@ -8189,9 +8404,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
   bool IVUpdateMayOverflow = false;
   for (ElementCount VF : Range)
-    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
+    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost, VF);
 
-  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
+  // TODO: Is the answer the same for both vector main body and epilogue?
+  TailFoldingStyle Style = Cost->getTailFoldingStyle(IVUpdateMayOverflow);
   // Use NUW for the induction increment if we proved that it won't overflow in
   // the vector loop or when not folding the tail. In the later case, we know
   // that the canonical induction increment will not overflow as the vector trip
@@ -8218,9 +8434,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // placeholders for its members' Recipes which we'll be replacing with a
   // single VPInterleaveRecipe.
   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
-    auto ApplyIG = [IG, this](ElementCount VF) -> bool {
+    auto ApplyIG = [IG, Cost](ElementCount VF) -> bool {
       bool Result = (VF.isVector() && // Query is illegal for VF == 1
-                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
+                     Cost->getWideningDecision(IG->getInsertPos(), VF) ==
                          LoopVectorizationCostModel::CM_Interleave);
       // For scalable vectors, the interleave factors must be <= 8 since we
       // require the (de)interleaveN intrinsics instead of shufflevectors.
@@ -8236,13 +8452,13 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // ---------------------------------------------------------------------------
   // Predicate and linearize the top-level loop region.
   // ---------------------------------------------------------------------------
-  VPlanTransforms::introduceMasksAndLinearize(*Plan, CM.foldTailByMasking());
+  VPlanTransforms::introduceMasksAndLinearize(*Plan, Cost->foldTailByMasking());
 
   // ---------------------------------------------------------------------------
   // Construct wide recipes and apply predication for original scalar
   // VPInstructions in the loop.
   // ---------------------------------------------------------------------------
-  VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder);
+  VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, *Cost, Builder);
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
@@ -8337,7 +8553,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // bring the VPlan to its final state.
   // ---------------------------------------------------------------------------
 
-  addReductionResultComputation(Plan, RecipeBuilder, Range.Start);
+  addReductionResultComputation(Cost, Plan, RecipeBuilder, Range.Start);
 
   // Optimize FindIV reductions to use sentinel-based approach when possible.
   RUN_VPLAN_PASS(VPlanTransforms::optimizeFindIVReductions, *Plan, PSE,
@@ -8361,9 +8577,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // range for better cost estimation.
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
-  if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
-                          OrigLoop);
+  if (!Cost->foldTailWithEVL()) {
+    VPCostContext CostCtx(Cost->TTI, *Cost->TLI, *Plan, *Cost, Cost->CostKind,
+                          Cost->PSE, OrigLoop);
     RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
                    Range);
     RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
@@ -8378,7 +8594,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // for this VPlan, replace the Recipes widening its memory instructions with a
   // single VPInterleaveRecipe at its insertion point.
   RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
-                 InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
+                 InterleaveGroups, RecipeBuilder, Cost->isScalarEpilogueAllowed());
 
   // Replace VPValues for known constant strides.
   RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
@@ -8453,7 +8669,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
 }
 
 void LoopVectorizationPlanner::addReductionResultComputation(
-    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
+    LoopVectorizationCostModel *Cost, VPlanPtr &Plan,
+    VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
   using namespace VPlanPatternMatch;
   VPTypeAnalysis TypeInfo(*Plan);
   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
@@ -8716,7 +8933,8 @@ void LoopVectorizationPlanner::addMinimumIterationCheck(
 static ScalarEpilogueLowering getScalarEpilogueLowering(
     Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
     TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
-    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
+    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI,
+    bool ForEpilogue = false) {
   // 1) OptSize takes precedence over all other options, i.e. if this is set,
   // don't look at hints or options, and don't request a scalar epilogue.
   if (F->hasOptSize() ||
@@ -8725,11 +8943,13 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
 
   // 2) If set, obey the directives
   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+    if (ForEpilogue && PreferPredicateOverEpilogue == PreferPredicateTy::PredicatedEpilogue)
+      return CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
     switch (PreferPredicateOverEpilogue) {
     case PreferPredicateTy::ScalarEpilogue:
       return CM_ScalarEpilogueAllowed;
     case PreferPredicateTy::PredicateElseScalarEpilogue:
-      return CM_ScalarEpilogueNotNeededUsePredicate;
+      return CM_ScalarEpilogueNotNeededUsePredicatedBody;
     case PreferPredicateTy::PredicateOrDontVectorize:
       return CM_ScalarEpilogueNotAllowedUsePredicate;
     };
@@ -8738,7 +8958,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
   // 3) If set, obey the hints
   switch (Hints.getPredicate()) {
   case LoopVectorizeHints::FK_Enabled:
-    return CM_ScalarEpilogueNotNeededUsePredicate;
+    return CM_ScalarEpilogueNotNeededUsePredicatedBody;
   case LoopVectorizeHints::FK_Disabled:
     return CM_ScalarEpilogueAllowed;
   };
@@ -8746,7 +8966,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
   // 4) if the TTI hook indicates this is profitable, request predication.
   TailFoldingInfo TFI(TLI, &LVL, IAI);
   if (TTI->preferPredicateOverEpilogue(&TFI))
-    return CM_ScalarEpilogueNotNeededUsePredicate;
+    return CM_ScalarEpilogueNotNeededUsePredicatedBody;
 
   return CM_ScalarEpilogueAllowed;
 }
@@ -9017,7 +9237,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
   SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
   for (VPRecipeBase &R :
        EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    if (isa<VPCanonicalIVPHIRecipe>(&R))
+    if (isa<VPCanonicalIVPHIRecipe>(&R) || isa<VPActiveLaneMaskPHIRecipe>(&R))
       continue;
     EpiWidenedPhis.insert(
         cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
@@ -9120,8 +9340,9 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
   // FIXME: Improve modeling for canonical IV start values in the epilogue
   // loop.
   using namespace llvm::PatternMatch;
-  PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
-  for (Value *Inc : EPResumeVal->incoming_values()) {
+  PHINode *IVResumeVal = &*L->getLoopPreheader()->phis().begin();
+
+  for (Value *Inc : IVResumeVal->incoming_values()) {
     if (match(Inc, m_SpecificInt(0)))
       continue;
     assert(!EPI.VectorTripCount &&
@@ -9134,20 +9355,22 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
   // TODO: We should not choose VF * UF so the main vector loop is known to
   // be dead.
   if (!EPI.VectorTripCount) {
-    assert(EPResumeVal->getNumIncomingValues() > 0 &&
-           all_of(EPResumeVal->incoming_values(),
+    assert(IVResumeVal->getNumIncomingValues() > 0 &&
+           all_of(IVResumeVal->incoming_values(),
                   [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
            "all incoming values must be 0");
-    EPI.VectorTripCount = EPResumeVal->getOperand(0);
+    EPI.VectorTripCount = IVResumeVal->getOperand(0);
   }
-  VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
+  VPValue *VPV = Plan.getOrAddLiveIn(IVResumeVal);
   assert(all_of(IV->users(),
                 [](const VPUser *U) {
                   return isa<VPScalarIVStepsRecipe>(U) ||
                          isa<VPDerivedIVRecipe>(U) ||
                          cast<VPRecipeBase>(U)->isScalarCast() ||
                          cast<VPInstruction>(U)->getOpcode() ==
-                             Instruction::Add;
+                             Instruction::Add ||
+                         cast<VPInstruction>(U)->getOpcode() ==
+                             VPInstruction::CanonicalIVIncrementForPart;
                 }) &&
          "the canonical IV should only be used by its increment or "
          "ScalarIVSteps when resetting the start value");
@@ -9163,6 +9386,15 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
   // handled above.
   for (VPRecipeBase &R : drop_begin(Header->phis())) {
     Value *ResumeV = nullptr;
+    if (isa<VPActiveLaneMaskPHIRecipe>(&R)) {
+      // Needs extracting from the start value ActiveLaneMask instruction.
+      auto *ALM = cast<VPInstruction>(cast<VPActiveLaneMaskPHIRecipe>(&R)->getOperand(0));
+      assert(ALM->getOpcode() == VPInstruction::ActiveLaneMask);
+      assert(IVResumeVal && "must have a resume value for the canonical IV");
+      VPValue *VPV = Plan.getOrAddLiveIn(IVResumeVal);
+      ALM->setOperand(0, VPV);
+      continue;
+    }
     // TODO: Move setting of resume values to prepareToExecute.
     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
       // Find the reduction result by searching users of the phi or its backedge
@@ -9281,7 +9513,8 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
   VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
       Plan, EPI.TripCount, EPI.VectorTripCount,
       CM.requiresScalarEpilogue(EPI.EpilogueVF.isVector()), EPI.EpilogueVF,
-      EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
+      EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE,
+      CM.foldTailByMasking());
 
   return InstsToMove;
 }
@@ -9313,7 +9546,8 @@ static Value *createInductionAdditionalBypassValues(
   return EndValueFromAdditionalBypass;
 }
 
-static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
+static void fixScalarResumeValuesFromBypass(LoopVectorizationPlanner &LVP,
+                                            BasicBlock *BypassBlock, Loop *L,
                                             VPlan &BestEpiPlan,
                                             LoopVectorizationLegality &LVL,
                                             const SCEV2ValueTy &ExpandedSCEVs,
@@ -9340,13 +9574,15 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
 
   // Fix induction resume values from the additional bypass block.
   IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
-  for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
-    auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
-    Value *V = createInductionAdditionalBypassValues(
-        IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
-        LVL.getPrimaryInduction());
-    // TODO: Directly add as extra operand to the VPResumePHI recipe.
-    Inc->setIncomingValueForBlock(BypassBlock, V);
+  if (!LVP.isEpilogueTailFolded()) {
+    for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
+      auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+      Value *V = createInductionAdditionalBypassValues(
+          IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
+          LVL.getPrimaryInduction());
+      // TODO: Directly add as extra operand to the VPResumePHI recipe.
+      Inc->setIncomingValueForBlock(BypassBlock, V);
+    }
   }
 }
 
@@ -9356,6 +9592,7 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
 // InstsToMove contains instructions that need to be moved to the preheader of
 // the epilogue vector loop.
 static void connectEpilogueVectorLoop(
+    LoopVectorizationPlanner &LVP,
     VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
     DominatorTree *DT, LoopVectorizationLegality &LVL,
     DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
@@ -9437,11 +9674,11 @@ static void connectEpilogueVectorLoop(
   auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
   for (auto *I : InstsToMove)
     I->moveBefore(IP);
-
+
   // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
   // after executing the main loop. We need to update the resume values of
   // inductions and reductions during epilogue vectorization.
-  fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
+  fixScalarResumeValuesFromBypass(LVP, VecEpilogueIterationCountCheck, L, EpiPlan,
                                   LVL, ExpandedSCEVs, EPI.VectorTripCount);
 }
 
@@ -9576,7 +9813,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       // with runtime checks. It's more effective to let
       // `isOutsideLoopWorkProfitable` determine if vectorization is
       // beneficial for the loop.
-      if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
+      if (SEL != CM_ScalarEpilogueNotNeededUsePredicatedBody &&
+          SEL != CM_ScalarEpilogueNotNeededUsePredicatedEpilogue)
         SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
     }
   }
@@ -9630,6 +9868,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                 GetBFI, F, &Hints, IAI, OptForSize);
+  ScalarEpilogueLowering EpilogueTailFoldedSEL =
+      getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, &IAI,
+                                /*ForEpilogue*/ true);
+  LoopVectorizationCostModel EpilogueTailFoldedCM(EpilogueTailFoldedSEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
+                                 GetBFI, F, &Hints, IAI, OptForSize);
+
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                                ORE);
@@ -9640,6 +9884,24 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
     UserIC = 1;
 
+  // TODO: Decide how an explicit predication hint should interact here.
+  if (EpilogueTailFoldedSEL == CM_ScalarEpilogueNotNeededUsePredicatedEpilogue &&
+      !CM.requiresScalarEpilogue(true)) {
+    bool HasReductions = !LVL.getReductionVars().empty();
+    bool HasSelectCmpReductions =
+        HasReductions &&
+        any_of(LVL.getReductionVars(), [&](auto &Reduction) -> bool {
+          const RecurrenceDescriptor &RdxDesc = Reduction.second;
+          RecurKind RK = RdxDesc.getRecurrenceKind();
+          return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+                 RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
+                 RecurrenceDescriptor::isMinMaxRecurrenceKind(RK);
+        });
+    if (!HasSelectCmpReductions) {
+      LVP.setEpilogueTailFoldingCM(&EpilogueTailFoldedCM);
+    }
+  }
+
   // Plan how to best vectorize.
   LVP.plan(UserVF, UserIC);
   VectorizationFactor VF = LVP.computeBestVF();
@@ -9712,7 +9974,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                   "Ignoring user-specified interleave count due to possibly "
                   "unsafe dependencies in the loop."};
     InterleaveLoop = false;
-  } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
+  } else if (!LVP.hasPlanWithVF(VF.Width) &&
+              UserIC > 1) {
     // Tell the user interleaving was avoided up-front, despite being explicitly
     // requested.
     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
@@ -9835,27 +10098,30 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // The first pass vectorizes the main loop and creates a scalar epilogue
     // to be vectorized by executing the plan (potentially with a different
     // factor) again shortly afterwards.
-    VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
+    VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width, /*ForEpilogue*/ true);
     BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
     BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
     preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
     EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
                                       BestEpiPlan);
     EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
-                                       Checks, *BestMainPlan);
+                                       Checks, *BestMainPlan,
+                                       LVP.isEpilogueTailFolded());
     auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
                                          *BestMainPlan, MainILV, DT, false);
     ++LoopsVectorized;
 
     // Second pass vectorizes the epilogue and adjusts the control flow
     // edges from the first pass.
-    EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
-                                             Checks, BestEpiPlan);
+    EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI,
+                                             LVP.isEpilogueTailFolded() ? LVP.getEpilogueTailFoldingCM() : &CM,
+                                             Checks, BestEpiPlan, LVP.isEpilogueTailFolded());
     SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
-        BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE());
+        BestEpiPlan, L, ExpandedSCEVs, EPI, LVP.isEpilogueTailFolded() ? *LVP.getEpilogueTailFoldingCM() : CM, *PSE.getSE());
     LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
                     true);
-    connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
+    if (!LVP.isEpilogueTailFolded())
+      connectEpilogueVectorLoop(LVP, BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
                               Checks, InstsToMove);
     ++LoopsEpilogueVectorized;
   } else {
@@ -9872,6 +10138,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     ++LoopsVectorized;
   }
 
+  LLVM_DEBUG(dbgs() << "LV: Function after vectorization:\n"; F->dump());
   assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
          "DT not preserved correctly");
   assert(!verifyFunction(*F, &dbgs()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 9b22c399d7acf..a0dc620206e6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1590,13 +1590,16 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
   }
 }
 
-VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF) const {
-  assert(count_if(VPlans,
+VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF,
+                                            bool ForEpilogue) const {
+  bool UseEpilogueTailFoldedPlans = ForEpilogue && isEpilogueTailFolded();
+  assert(count_if(UseEpilogueTailFoldedPlans ? EpilogueTailFoldedPlans : VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
              1 &&
          "Multiple VPlans for VF.");
 
-  for (const VPlanPtr &Plan : VPlans) {
+  for (const VPlanPtr &Plan :
+       UseEpilogueTailFoldedPlans ? EpilogueTailFoldedPlans : VPlans) {
     if (Plan->hasVF(VF))
       return *Plan.get();
   }
@@ -1745,6 +1748,15 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
       Plan->printDOT(O);
     else
       Plan->print(O);
+
+  if (EpilogueTailFoldedPlans.empty())
+    return;
+  O << "LV: Printing out predicated plans\n";
+  for (const auto &Plan : EpilogueTailFoldedPlans)
+    if (PrintVPlansInDotFormat)
+      Plan->printDOT(O);
+    else
+      Plan->print(O);
 }
 #endif
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index da1035847020a..315120bf0ed1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1120,10 +1120,17 @@ void VPlanTransforms::addMinimumIterationCheck(
 void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
     VPlan &Plan, Value *TripCount, Value *VectorTripCount,
     bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
-    unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE) {
+    unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE,
+    bool EpilogueTailFolded) {
   // Add the minimum iteration check for the epilogue vector loop.
-  VPValue *TC = Plan.getOrAddLiveIn(TripCount);
   VPBuilder Builder(cast<VPBasicBlock>(Plan.getEntry()));
+  if (EpilogueTailFolded) {
+    // A tail-folded epilogue handles any remainder; always branch to it.
+    Builder.createNaryOp(VPInstruction::BranchOnCond, Plan.getFalse());
+    return;
+  }
+
+  VPValue *TC = Plan.getOrAddLiveIn(TripCount);
   VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount(
       TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW));
   VPValue *Count = Builder.createSub(TC, Plan.getOrAddLiveIn(VectorTripCount),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index da3afe7ce6d03..75c39fe67b142 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2673,6 +2673,9 @@ void VPlanTransforms::truncateToMinimalBitwidths(
   // cannot use RAUW after creating a new truncate, as this would could make
   // other uses have different types for their operands, making them invalidly
   // typed.
+  // No truncations requested; skip the (non-trivial) analysis setup below.
+  if (MinBWs.empty())
+    return;
   DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
   VPTypeAnalysis TypeInfo(Plan);
   VPBasicBlock *PH = Plan.getVectorPreheader();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5fc68c2df145f..7e657965b235b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -165,7 +165,8 @@ struct VPlanTransforms {
   static void addMinimumVectorEpilogueIterationCheck(
       VPlan &Plan, Value *TripCount, Value *VectorTripCount,
       bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
-      unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE);
+      unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE,
+      bool EpilogueTailFolded);
 
   /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
   /// flat CFG into a hierarchical CFG.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-tail-folded-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-tail-folded-vect.ll
new file mode 100644
index 0000000000000..a3e1bb2f3c522
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-tail-folded-vect.ll
@@ -0,0 +1,393 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^for.body:" --filter-out-after "^loop:"
+; REQUIRES: asserts
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 \
+; RUN:   -prefer-predicate-over-epilogue=predicated-epilogue -S | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple = "aarch64-linux-gnu"
+
+define void @main_vf_vscale_x_16(ptr %A) #0 {
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_16(
+; CHECK-VF8-NEXT:  iter.check:
+; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8:       vector.main.loop.iter.check:
+; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5
+; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_ITER_CHECK]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8:       vector.ph:
+; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8:       vector.body:
+; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP3]]
+; CHECK-VF8-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP5]], align 1
+; CHECK-VF8-NEXT:    store <vscale x 16 x i8> splat (i8 1), ptr [[TMP6]], align 1
+; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF8:       middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK]]
+; CHECK-VF8:       vec.epilog.iter.check:
+; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_PH:%.*]]
+; CHECK-VF8:       vec.epilog.ph:
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[VEC_EPILOG_RESUME_VAL]], i64 1024)
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8:       vec.epilog.vector.body:
+; CHECK-VF8-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VEC_EPILOG_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT:    call void @llvm.masked.store.v8i8.p0(<8 x i8> splat (i8 1), ptr align 1 [[TMP8]], <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-VF8-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], 8
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX_NEXT2]], i64 1024)
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-VF8-NEXT:    [[TMP10:%.*]] = xor i1 [[TMP9]], true
+; CHECK-VF8-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-VF8:       vec.epilog.middle.block:
+; CHECK-VF8-NEXT:    br label [[EXIT]]
+; CHECK-VF8:       exit:
+; CHECK-VF8-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %A, i64 %iv
+  store i8 1, ptr %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 1024
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) {
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_2_no_epi_iteration(
+; CHECK-VF8-NEXT:  iter.check:
+; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8:       vector.main.loop.iter.check:
+; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_ITER_CHECK]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8:       vector.ph:
+; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8:       vector.body:
+; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 [[TMP3]]
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP5]], align 1
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
+; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF8:       middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK]]
+; CHECK-VF8:       vec.epilog.iter.check:
+; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_PH:%.*]]
+; CHECK-VF8:       vec.epilog.ph:
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[VEC_EPILOG_RESUME_VAL]], i64 1024)
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8:       vec.epilog.vector.body:
+; CHECK-VF8-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VEC_EPILOG_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT:    call void @llvm.masked.store.v8i64.p0(<8 x i64> splat (i64 1), ptr align 1 [[TMP8]], <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-VF8-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], 8
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX_NEXT2]], i64 1024)
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-VF8-NEXT:    [[TMP10:%.*]] = xor i1 [[TMP9]], true
+; CHECK-VF8-NEXT:    br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF8:       vec.epilog.middle.block:
+; CHECK-VF8-NEXT:    br label [[EXIT]]
+; CHECK-VF8:       exit:
+; CHECK-VF8-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i64, ptr %A, i64 %iv
+  store i64 1, ptr %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 1024
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) {
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_2(
+; CHECK-VF8-NEXT:  iter.check:
+; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8:       vector.main.loop.iter.check:
+; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_ITER_CHECK]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8:       vector.ph:
+; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
+; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8:       vector.body:
+; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 [[TMP3]]
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP5]], align 1
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1
+; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF8:       middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK]]
+; CHECK-VF8:       vec.epilog.iter.check:
+; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_PH:%.*]]
+; CHECK-VF8:       vec.epilog.ph:
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = sub i64 [[N]], 8
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 8
+; CHECK-VF8-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[VEC_EPILOG_RESUME_VAL]], i64 [[N]])
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8:       vec.epilog.vector.body:
+; CHECK-VF8-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VEC_EPILOG_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX2]]
+; CHECK-VF8-NEXT:    call void @llvm.masked.store.v8i64.p0(<8 x i64> splat (i64 1), ptr align 1 [[TMP11]], <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-VF8-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX2]], 8
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX2]], i64 [[TMP10]])
+; CHECK-VF8-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-VF8-NEXT:    [[TMP13:%.*]] = xor i1 [[TMP12]], true
+; CHECK-VF8-NEXT:    br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF8:       vec.epilog.middle.block:
+; CHECK-VF8-NEXT:    br label [[EXIT]]
+; CHECK-VF8:       exit:
+; CHECK-VF8-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i64, ptr %A, i64 %iv
+  store i64 1, ptr %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, %n
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_pr57912_pointer_induction(ptr %start) #0 {
+;
+; CHECK-VF8-LABEL: @test_pr57912_pointer_induction(
+; CHECK-VF8-NEXT:  iter.check:
+; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8:       vector.main.loop.iter.check:
+; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5
+; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 10000, [[TMP1]]
+; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_ITER_CHECK]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8:       vector.ph:
+; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1
+; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP4]]
+; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 10000, [[N_MOD_VF]]
+; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8:       vector.body:
+; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP3]]
+; CHECK-VF8-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1
+; CHECK-VF8-NEXT:    store <vscale x 16 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-VF8:       middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK]]
+; CHECK-VF8:       vec.epilog.iter.check:
+; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_PH:%.*]]
+; CHECK-VF8:       vec.epilog.ph:
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[VEC_EPILOG_RESUME_VAL]], i64 10000)
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8:       vec.epilog.vector.body:
+; CHECK-VF8-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VEC_EPILOG_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX1]]
+; CHECK-VF8-NEXT:    call void @llvm.masked.store.v8i8.p0(<8 x i8> zeroinitializer, ptr align 1 [[NEXT_GEP2]], <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-VF8-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], 8
+; CHECK-VF8-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[INDEX_NEXT3]], i64 10000)
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = xor i1 [[TMP7]], true
+; CHECK-VF8-NEXT:    br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-VF8:       vec.epilog.middle.block:
+; CHECK-VF8-NEXT:    br label [[EXIT]]
+; CHECK-VF8:       exit:
+; CHECK-VF8-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop ]
+  store i8 0, ptr %ptr.iv, align 1
+  %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %cmp = icmp eq i64 %iv.next, 10000
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 {
+;
+; CHECK-VF8-LABEL: @trip_count_vscale(
+; CHECK-VF8-NEXT:  entry:
+; CHECK-VF8-NEXT:    [[V:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[N:%.*]] = mul nuw nsw i64 [[V]], 1033
+; CHECK-VF8-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-VF8:       vector.ph:
+; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-VF8-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
+; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP2]]
+; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8:       vector.body:
+; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[TMP1]]
+; CHECK-VF8-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4
+; CHECK-VF8-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP1]]
+; CHECK-VF8-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
+; CHECK-VF8-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
+; CHECK-VF8-NEXT:    store <vscale x 4 x float> [[TMP7]], ptr [[TMP5]], align 4
+; CHECK-VF8-NEXT:    store <vscale x 4 x float> [[TMP8]], ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-VF8:       middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-VF8:       scalar.ph:
+; CHECK-VF8-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-VF8:       for.body:
+;
+entry:
+  %v = tail call i64 @llvm.vscale.i64()
+  %n = mul nuw nsw i64 %v, 1033
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %iv
+  %l.a = load float, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds nuw float, ptr %b, i64 %iv
+  %l.b = load float, ptr %arrayidx3, align 4
+  %mul4 = fmul float %l.a, %l.b
+  store float %mul4, ptr %arrayidx3, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 {
+;
+; CHECK-VF8-LABEL: @trip_count_vscale_no_epilogue_iterations(
+; CHECK-VF8-NEXT:  entry:
+; CHECK-VF8-NEXT:    [[V:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[N:%.*]] = mul nuw nsw i64 [[V]], 1024
+; CHECK-VF8-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-VF8:       vector.ph:
+; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; CHECK-VF8-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
+; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP2]]
+; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8:       vector.body:
+; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i64 [[TMP1]]
+; CHECK-VF8-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP3]], align 4
+; CHECK-VF8-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP4]], align 4
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[TMP5]], i64 [[TMP1]]
+; CHECK-VF8-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
+; CHECK-VF8-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
+; CHECK-VF8-NEXT:    store <vscale x 4 x float> [[TMP7]], ptr [[TMP5]], align 4
+; CHECK-VF8-NEXT:    store <vscale x 4 x float> [[TMP8]], ptr [[TMP6]], align 4
+; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-VF8:       middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-VF8:       scalar.ph:
+; CHECK-VF8-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-VF8:       for.body:
+;
+entry:
+  %v = tail call i64 @llvm.vscale.i64()
+  %n = mul nuw nsw i64 %v, 1024
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %iv
+  %l.a = load float, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds nuw float, ptr %b, i64 %iv
+  %l.b = load float, ptr %arrayidx3, align 4
+  %mul4 = fmul float %l.a, %l.b
+  store float %mul4, ptr %arrayidx3, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }



More information about the llvm-commits mailing list