[llvm] This is a draft for enabling opt-in tail-folding on vectorized epilogue. (PR #181401)

Fri Feb 13 11:01:38 PST 2026

github-actions[bot] wrote:




:warning: The C/C++ code formatter, clang-format, found issues in your code. :warning:

<details>
<summary>
You can test this locally with the following command:
</summary>

``````````bash
git-clang-format --diff origin/main HEAD --extensions h,cpp -- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h llvm/lib/Transforms/Utils/BasicBlockUtils.cpp llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h llvm/lib/Transforms/Vectorize/LoopVectorize.cpp llvm/lib/Transforms/Vectorize/VPlan.cpp llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp llvm/lib/Transforms/Vectorize/VPlanTransforms.h --diff_from_common_commit
``````````

:warning:
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing `origin/main` to the base branch/commit you want to compare against.
:warning:

</details>

<details>
<summary>
View the diff from clang-format here.
</summary>

``````````diff

diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index b0eb4dc96..cd8ba5b4b 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -152,7 +152,7 @@ void llvm::DeleteDeadBlocks(ArrayRef <BasicBlock *> BBs, DomTreeUpdater *DTU,
 #ifndef NDEBUG
   // Make sure that all predecessors of each dead block is also dead.
   SmallPtrSet<BasicBlock *, 4> Dead(llvm::from_range, BBs);
-// assert(Dead.size() == BBs.size() && "Duplicating blocks?");
+  // assert(Dead.size() == BBs.size() && "Duplicating blocks?");
   for (auto *BB : Dead)
     for (BasicBlock *Pred : predecessors(BB))
       assert(Dead.count(Pred) && "All predecessors must be dead!");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 835723362..4a2491958 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -2156,7 +2156,8 @@ void LoopVectorizationLegality::prepareToFoldTailByMasking() {
   // Mark all blocks for predication, including those that ordinarily do not
   // need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, TailFoldedMaskedOp);
+    [[maybe_unused]] bool R =
+        blockCanBePredicated(BB, SafePointers, TailFoldedMaskedOp);
     assert(R && "Must be able to predicate block when tail-folding.");
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 39f8ffbb2..5cb0ac8e3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -540,7 +540,8 @@ public:
       PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints,
       OptimizationRemarkEmitter *ORE)
       : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
-        EpilogueTailFoldedCM(nullptr), IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
+        EpilogueTailFoldedCM(nullptr), IAI(IAI), PSE(PSE), Hints(Hints),
+        ORE(ORE) {}
 
   void setEpilogueTailFoldingCM(LoopVectorizationCostModel *Cost) {
     EpilogueTailFoldedCM = Cost;
@@ -552,9 +553,7 @@ public:
 
   bool isEpilogueTailFolded() const;
 
-  void disableEpilogueTailFolding() {
-    EpilogueTailFoldedCM = nullptr;
-  }
+  void disableEpilogueTailFolding() { EpilogueTailFoldedCM = nullptr; }
 
   /// Build VPlans for the specified \p UserVF and \p UserIC if they are
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
@@ -603,8 +602,10 @@ public:
   /// Look through the existing plans and return true if we have one with
   /// vectorization factor \p VF.
   bool hasPlanWithVF(ElementCount VF, bool ForEpilogue = false) const {
-    return any_of((ForEpilogue && isEpilogueTailFolded()) ? EpilogueTailFoldedPlans : VPlans,
-                   [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
+    return any_of((ForEpilogue && isEpilogueTailFolded())
+                      ? EpilogueTailFoldedPlans
+                      : VPlans,
+                  [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
   }
 
   /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6755f8a81..c87af8731 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -204,35 +204,33 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
 // and predicate the instructions accordingly. If tail-folding fails, there are
 // different fallback strategies depending on these values:
 namespace PreferPredicateTy {
-  enum Option {
-    ScalarEpilogue = 0,
-    PredicateElseScalarEpilogue,
-    PredicateOrDontVectorize,
-    PredicatedEpilogue
-  };
+enum Option {
+  ScalarEpilogue = 0,
+  PredicateElseScalarEpilogue,
+  PredicateOrDontVectorize,
+  PredicatedEpilogue
+};
 } // namespace PreferPredicateTy
 
 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
     "prefer-predicate-over-epilogue",
-    cl::init(PreferPredicateTy::ScalarEpilogue),
-    cl::Hidden,
+    cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden,
     cl::desc("Tail-folding and predication preferences over creating a scalar "
              "epilogue loop."),
-    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
-                         "scalar-epilogue",
-                         "Don't tail-predicate loops, create scalar epilogue"),
-              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
-                         "predicate-else-scalar-epilogue",
-                         "prefer tail-folding, create scalar epilogue if tail "
-                         "folding fails."),
-              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
-                         "predicate-dont-vectorize",
-                         "prefers tail-folding, don't attempt vectorization if "
-                         "tail-folding fails."),
-              clEnumValN(PreferPredicateTy::PredicatedEpilogue,
-                         "predicated-epilogue",
-                         "prefers predicated vector epilogues, falling back on "
-                         "scalar epilogues if it fails.")));
+    cl::values(
+        clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+                   "Don't tail-predicate loops, create scalar epilogue"),
+        clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+                   "predicate-else-scalar-epilogue",
+                   "prefer tail-folding, create scalar epilogue if tail "
+                   "folding fails."),
+        clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+                   "predicate-dont-vectorize",
+                   "prefers tail-folding, don't attempt vectorization if "
+                   "tail-folding fails."),
+        clEnumValN(PreferPredicateTy::PredicatedEpilogue, "predicated-epilogue",
+                   "prefers predicated vector epilogues, falling back on "
+                   "scalar epilogues if it fails.")));
 
 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
     "force-tail-folding-style", cl::desc("Force the tail folding style"),
@@ -672,18 +670,15 @@ protected:
 /// epilogues.
 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
 public:
-  EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
-                             LoopInfo *LI, DominatorTree *DT,
-                             const TargetTransformInfo *TTI,
-                             AssumptionCache *AC,
-                             EpilogueLoopVectorizationInfo &EPI,
-                             LoopVectorizationCostModel *CM,
-                             GeneratedRTChecks &Check, VPlan &Plan,
-                             bool isEpilogueTailFolded)
+  EpilogueVectorizerMainLoop(
+      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+      DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
+      EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
+      GeneratedRTChecks &Check, VPlan &Plan, bool isEpilogueTailFolded)
       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                        Check, Plan, EPI.MainLoopVF,
                                        EPI.MainLoopVF, EPI.MainLoopUF,
-                                        isEpilogueTailFolded) {}
+                                       isEpilogueTailFolded) {}
   /// Implements the interface for creating a vectorized skeleton using the
   /// *main loop* strategy (i.e., the first pass of VPlan execution).
   BasicBlock *createVectorizedLoopSkeleton() final;
@@ -712,14 +707,11 @@ protected:
 // their epilogues.
 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
 public:
-  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
-                                 LoopInfo *LI, DominatorTree *DT,
-                                 const TargetTransformInfo *TTI,
-                                 AssumptionCache *AC,
-                                 EpilogueLoopVectorizationInfo &EPI,
-                                 LoopVectorizationCostModel *CM,
-                                 GeneratedRTChecks &Checks, VPlan &Plan,
-                                 bool isEpilogueTailFolded)
+  EpilogueVectorizerEpilogueLoop(
+      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
+      DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
+      EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
+      GeneratedRTChecks &Checks, VPlan &Plan, bool isEpilogueTailFolded)
       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                        Checks, Plan, EPI.EpilogueVF,
                                        EPI.EpilogueVF, EPI.EpilogueUF,
@@ -1339,12 +1331,14 @@ public:
     // the cost model may still decide it's not worth it and should fall back
     // on a scalar epilogue.
     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
-           ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
+           ScalarEpilogueStatus ==
+               CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
   }
 
   /// Returns true if tail-folding is preferred over a scalar epilogue.
   bool preferPredicatedLoop() const {
-    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicatedBody ||
+    return ScalarEpilogueStatus ==
+               CM_ScalarEpilogueNotNeededUsePredicatedBody ||
            ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
   }
 
@@ -1362,7 +1356,7 @@ public:
   /// \param UserIC User specific interleave count.
   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
     // TODO: Should probably have separate style for the main and epilogue.
-    
+
     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
     if (!Legal->canFoldTailByMasking()) {
       ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
@@ -3690,7 +3684,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     // none were taken so far.
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
-  bool TailFoldedIntoMainBody = 
+  bool TailFoldedIntoMainBody =
       ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
   FixedScalableVFPair MaxFactors =
       computeFeasibleMaxVF(MaxTC, UserVF, UserIC, TailFoldedIntoMainBody);
@@ -4536,7 +4530,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
   }
 
   if (!EpilogueTailFoldedCM &&
-    !CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
+      !CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                          "this loop\n");
     return Result;
@@ -4596,7 +4590,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
   for (auto &NextVF : ProfitableVFs) {
     // Skip candidate VFs without a corresponding VPlan.
     if (!hasPlanWithVF(NextVF.Width,
-                      /*ForEpilogue*/ true))
+                       /*ForEpilogue*/ true))
       continue;
 
     // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
@@ -4605,7 +4599,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
          ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
         (NextVF.Width.isScalable() &&
-        ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
+         ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
         (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
          ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
       continue;
@@ -4638,8 +4632,10 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     // TODO: Need to update last flag passed to isMoreProfitable.
     if (Result.Width.isScalar() ||
         isMoreProfitable(NextVF, Result, MaxTripCount,
-                        EpilogueTailFoldedCM ? !EpilogueTailFoldedCM->foldTailByMasking() :
-                        !CM.foldTailByMasking(), /*IsEpilogue*/ true))
+                         EpilogueTailFoldedCM
+                             ? !EpilogueTailFoldedCM->foldTailByMasking()
+                             : !CM.foldTailByMasking(),
+                         /*IsEpilogue*/ true))
       Result = NextVF;
   }
 
@@ -6912,28 +6908,32 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     //    dbgs() << "Preparing additional predicated version of cost model\n";
     EpilogueTailFoldedCM->collectValuesToIgnore();
     EpilogueTailFoldedCM->collectElementTypesForWidening();
-    EpilogueTailFoldedCM->setTailFoldingStyles(MaxFactors.ScalableVF.isNonZero(), UserIC);
+    EpilogueTailFoldedCM->setTailFoldingStyles(
+        MaxFactors.ScalableVF.isNonZero(), UserIC);
     // If the max VF is likely to be 2, then there probably isn't much point
     // generating a predicated vector epilogue.
     unsigned EstimatedMaxVF = MaxFactors.FixedVF.getFixedValue();
     if (MaxFactors.ScalableVF.isNonZero()) {
-      unsigned EstimatedMaxScalableVF = estimateElementCount(MaxFactors.ScalableVF, CM.getVScaleForTuning());
+      unsigned EstimatedMaxScalableVF =
+          estimateElementCount(MaxFactors.ScalableVF, CM.getVScaleForTuning());
       EstimatedMaxVF = std::max(EstimatedMaxVF, EstimatedMaxScalableVF);
     }
-    if (EpilogueTailFoldedCM->foldTailByMasking() && !EpilogueTailFoldedCM->InterleaveInfo.hasGroups() && EstimatedMaxVF > 2) {
+    if (EpilogueTailFoldedCM->foldTailByMasking() &&
+        !EpilogueTailFoldedCM->InterleaveInfo.hasGroups() &&
+        EstimatedMaxVF > 2) {
       EpilogueTailFoldedCM->copyMinimalBitwidths(CM.getMinimalBitwidths());
 
       // Invalidate interleave groups if all blocks of loop will be predicated.
-      if (EpilogueTailFoldedCM->blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
+      if (EpilogueTailFoldedCM->blockNeedsPredicationForAnyReason(
+              OrigLoop->getHeader()) &&
           !useMaskedInterleavedAccesses(TTI)) {
-        LLVM_DEBUG(
-            dbgs()
-            << "LV: Invalidate all interleaved groups due to fold-tail by masking "
-               "which requires masked-interleaved support.\n");
+        LLVM_DEBUG(dbgs() << "LV: Invalidate all interleaved groups due to "
+                             "fold-tail by masking "
+                             "which requires masked-interleaved support.\n");
         if (EpilogueTailFoldedCM->InterleaveInfo.invalidateGroups())
-          // Invalidating interleave groups also requires invalidating all decisions
-          // based on them, which includes widening decisions and uniform and scalar
-          // values.
+          // Invalidating interleave groups also requires invalidating all
+          // decisions based on them, which includes widening decisions and
+          // uniform and scalar values.
           EpilogueTailFoldedCM->invalidateCostModelingDecisions();
       }
     } else
@@ -6941,7 +6941,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   }
 
   // TODO: Does this only apply for predicated main body?
-  if (CM.foldTailByMasking() || (EpilogueTailFoldedCM && EpilogueTailFoldedCM->foldTailByMasking()))
+  if (CM.foldTailByMasking() ||
+      (EpilogueTailFoldedCM && EpilogueTailFoldedCM->foldTailByMasking()))
     Legal->prepareToFoldTailByMasking();
 
   ElementCount MaxUserVF =
@@ -7421,55 +7422,59 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
     }
   }
 
-if (EpilogueTailFoldedCM) {
-  // Find profitable VFs for vector epilogue.
-  ProfitableVFs.clear();
-
-  for (auto &P : EpilogueTailFoldedPlans) {
-    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
-                               P->vectorFactors().end());
+  if (EpilogueTailFoldedCM) {
+    // Find profitable VFs for vector epilogue.
+    ProfitableVFs.clear();
+
+    for (auto &P : EpilogueTailFoldedPlans) {
+      ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
+                                 P->vectorFactors().end());
+
+      SmallVector<VPRegisterUsage, 8> RUs;
+      if (EpilogueTailFoldedCM->useMaxBandwidth(
+              TargetTransformInfo::RGK_ScalableVector) ||
+          EpilogueTailFoldedCM->useMaxBandwidth(
+              TargetTransformInfo::RGK_FixedWidthVector))
+        RUs = calculateRegisterUsageForPlan(
+            *P, VFs, TTI, EpilogueTailFoldedCM->ValuesToIgnore);
+
+      for (unsigned I = 0; I < VFs.size(); I++) {
+        ElementCount VF = VFs[I];
+        if (VF.isScalar())
+          continue;
+        if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
+          LLVM_DEBUG(
+              dbgs()
+              << "LV: Not considering vector loop of width " << VF
+              << " because it will not generate any vector instructions.\n");
+          continue;
+        }
+        if (EpilogueTailFoldedCM->OptForSize && !ForceVectorization &&
+            hasReplicatorRegion(*P)) {
+          LLVM_DEBUG(
+              dbgs()
+              << "LV: Not considering vector loop of width " << VF
+              << " because it would cause replicated blocks to be generated,"
+              << " which isn't allowed when optimizing for size.\n");
+          continue;
+        }
 
-    SmallVector<VPRegisterUsage, 8> RUs;
-    if (EpilogueTailFoldedCM->useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
-        EpilogueTailFoldedCM->useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
-      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, EpilogueTailFoldedCM->ValuesToIgnore);
+        InstructionCost Cost = cost(*P, VF);
+        VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
 
-    for (unsigned I = 0; I < VFs.size(); I++) {
-      ElementCount VF = VFs[I];
-      if (VF.isScalar())
-        continue;
-      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
-        LLVM_DEBUG(
-            dbgs()
-            << "LV: Not considering vector loop of width " << VF
-            << " because it will not generate any vector instructions.\n");
-        continue;
-      }
-      if (EpilogueTailFoldedCM->OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
-        LLVM_DEBUG(
-            dbgs()
-            << "LV: Not considering vector loop of width " << VF
-            << " because it would cause replicated blocks to be generated,"
-            << " which isn't allowed when optimizing for size.\n");
-        continue;
-      }
-
-      InstructionCost Cost = cost(*P, VF);
-      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+        if (EpilogueTailFoldedCM->shouldConsiderRegPressureForVF(VF) &&
+            RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
+          LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
+                            << VF << " because it uses too many registers\n");
+          continue;
+        }
 
-      if (EpilogueTailFoldedCM->shouldConsiderRegPressureForVF(VF) &&
-        RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
-        LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
-                        << VF << " because it uses too many registers\n");
-        continue;
+        // If profitable add it to ProfitableVF list.
+        if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
+          ProfitableVFs.push_back(CurrentFactor);
       }
-
-       // If profitable add it to ProfitableVF list.
-      if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
-        ProfitableVFs.push_back(CurrentFactor);
     }
   }
-}
 
 #ifndef NDEBUG
   // Select the optimal vectorization factor according to the legacy cost-model.
@@ -7820,12 +7825,10 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
   }
 
   if (ForEpilogue && isEpilogueTailFolded) {
-    BranchInst &BI =
-        *BranchInst::Create(Bypass, VectorPH, Builder.getFalse());
+    BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, Builder.getFalse());
     ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
   } else {
-    BranchInst &BI =
-        *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
+    BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
     if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
       setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
     ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
@@ -8332,8 +8335,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
-    if (auto Plan = tryToBuildVPlanWithVPRecipes(&CM,
-            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
+    if (auto Plan = tryToBuildVPlanWithVPRecipes(
+            &CM, std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange,
+            &LVer)) {
       // Now optimize the initial VPlan.
       VPlanTransforms::hoistPredicatedLoads(*Plan, PSE, OrigLoop);
       VPlanTransforms::sinkPredicatedStores(*Plan, PSE, OrigLoop);
@@ -8357,12 +8361,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(
-            EpilogueTailFoldedCM, std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
+            EpilogueTailFoldedCM, std::unique_ptr<VPlan>(VPlan0->duplicate()),
+            SubRange, &LVer)) {
       bool HasScalarVF = Plan->hasScalarVFOnly();
       // Now optimize the initial VPlan.
       if (!HasScalarVF)
-        RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths,
-                                 *Plan, EpilogueTailFoldedCM->getMinimalBitwidths());
+        RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths, *Plan,
+                       EpilogueTailFoldedCM->getMinimalBitwidths());
       RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
       assert(!EpilogueTailFoldedCM->foldTailWithEVL());
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
@@ -8594,7 +8599,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // for this VPlan, replace the Recipes widening its memory instructions with a
   // single VPInterleaveRecipe at its insertion point.
   RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
-                 InterleaveGroups, RecipeBuilder, Cost->isScalarEpilogueAllowed());
+                 InterleaveGroups, RecipeBuilder,
+                 Cost->isScalarEpilogueAllowed());
 
   // Replace VPValues for known constant strides.
   RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
@@ -8943,7 +8949,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
 
   // 2) If set, obey the directives
   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
-    if (ForEpilogue && PreferPredicateOverEpilogue == PreferPredicateTy::PredicatedEpilogue)
+    if (ForEpilogue &&
+        PreferPredicateOverEpilogue == PreferPredicateTy::PredicatedEpilogue)
       return CM_ScalarEpilogueNotNeededUsePredicatedEpilogue;
     switch (PreferPredicateOverEpilogue) {
     case PreferPredicateTy::ScalarEpilogue:
@@ -9388,12 +9395,13 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
     Value *ResumeV = nullptr;
     if (isa<VPActiveLaneMaskPHIRecipe>(&R)) {
       // Needs extracting from the start value ActiveLaneMask instruction.
-      auto *ALM = cast<VPInstruction>(cast<VPActiveLaneMaskPHIRecipe>(&R)->getOperand(0));
+      auto *ALM = cast<VPInstruction>(
+          cast<VPActiveLaneMaskPHIRecipe>(&R)->getOperand(0));
       assert(ALM->getOpcode() == VPInstruction::ActiveLaneMask);
       assert(IVResumeVal && "must have a resume value for the canonical IV");
       VPValue *VPV = Plan.getOrAddLiveIn(IVResumeVal);
       ALM->setOperand(0, VPV);
-       continue;
+      continue;
     }
     // TODO: Move setting of resume values to prepareToExecute.
     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
@@ -9591,12 +9599,13 @@ static void fixScalarResumeValuesFromBypass(LoopVectorizationPlanner &LVP,
 // and runtime checks of the main loop, as well as updating various phis. \p
 // InstsToMove contains instructions that need to be moved to the preheader of
 // the epilogue vector loop.
-static void connectEpilogueVectorLoop(
-    LoopVectorizationPlanner &LVP,
-    VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
-    DominatorTree *DT, LoopVectorizationLegality &LVL,
-    DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
-    ArrayRef<Instruction *> InstsToMove) {
+static void
+connectEpilogueVectorLoop(LoopVectorizationPlanner &LVP, VPlan &EpiPlan,
+                          Loop *L, EpilogueLoopVectorizationInfo &EPI,
+                          DominatorTree *DT, LoopVectorizationLegality &LVL,
+                          DenseMap<const SCEV *, Value *> &ExpandedSCEVs,
+                          GeneratedRTChecks &Checks,
+                          ArrayRef<Instruction *> InstsToMove) {
   BasicBlock *VecEpilogueIterationCountCheck =
       cast<VPIRBasicBlock>(EpiPlan.getEntry())->getIRBasicBlock();
 
@@ -9674,12 +9683,13 @@ static void connectEpilogueVectorLoop(
   auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
   for (auto *I : InstsToMove)
     I->moveBefore(IP);
-  
+
   // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
   // after executing the main loop. We need to update the resume values of
   // inductions and reductions during epilogue vectorization.
-  fixScalarResumeValuesFromBypass(LVP, VecEpilogueIterationCountCheck, L, EpiPlan,
-                                  LVL, ExpandedSCEVs, EPI.VectorTripCount);
+  fixScalarResumeValuesFromBypass(LVP, VecEpilogueIterationCountCheck, L,
+                                  EpiPlan, LVL, ExpandedSCEVs,
+                                  EPI.VectorTripCount);
 }
 
 bool LoopVectorizePass::processLoop(Loop *L) {
@@ -9871,8 +9881,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   ScalarEpilogueLowering EpilogueTailFoldedSEL =
       getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, &IAI,
                                 /*ForEpilogue*/ true);
-  LoopVectorizationCostModel EpilogueTailFoldedCM(EpilogueTailFoldedSEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
-                                 GetBFI, F, &Hints, IAI, OptForSize);
+  LoopVectorizationCostModel EpilogueTailFoldedCM(
+      EpilogueTailFoldedSEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, GetBFI,
+      F, &Hints, IAI, OptForSize);
 
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
@@ -9885,8 +9896,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     UserIC = 1;
 
   // What about predicated hint?
-  if (EpilogueTailFoldedSEL == CM_ScalarEpilogueNotNeededUsePredicatedEpilogue
-      && !CM.requiresScalarEpilogue(true)) {
+  if (EpilogueTailFoldedSEL ==
+          CM_ScalarEpilogueNotNeededUsePredicatedEpilogue &&
+      !CM.requiresScalarEpilogue(true)) {
     bool HasReductions = !LVL.getReductionVars().empty();
     bool HasSelectCmpReductions =
         HasReductions &&
@@ -9974,8 +9986,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                   "Ignoring user-specified interleave count due to possibly "
                   "unsafe dependencies in the loop."};
     InterleaveLoop = false;
-  } else if (!LVP.hasPlanWithVF(VF.Width) &&
-              UserIC > 1) {
+  } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
     // Tell the user interleaving was avoided up-front, despite being explicitly
     // requested.
     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
@@ -10113,16 +10124,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
     // Second pass vectorizes the epilogue and adjusts the control flow
     // edges from the first pass.
-    EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, 
-                                             LVP.isEpilogueTailFolded() ? LVP.getEpilogueTailFoldingCM() : &CM,
-                                             Checks, BestEpiPlan, LVP.isEpilogueTailFolded());
+    EpilogueVectorizerEpilogueLoop EpilogILV(
+        L, PSE, LI, DT, TTI, AC, EPI,
+        LVP.isEpilogueTailFolded() ? LVP.getEpilogueTailFoldingCM() : &CM,
+        Checks, BestEpiPlan, LVP.isEpilogueTailFolded());
     SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
-        BestEpiPlan, L, ExpandedSCEVs, EPI, LVP.isEpilogueTailFolded() ? *LVP.getEpilogueTailFoldingCM() : CM, *PSE.getSE());
+        BestEpiPlan, L, ExpandedSCEVs, EPI,
+        LVP.isEpilogueTailFolded() ? *LVP.getEpilogueTailFoldingCM() : CM,
+        *PSE.getSE());
     LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
                     true);
     if (!LVP.isEpilogueTailFolded())
-      connectEpilogueVectorLoop(LVP, BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
-                              Checks, InstsToMove);
+      connectEpilogueVectorLoop(LVP, BestEpiPlan, L, EPI, DT, LVL,
+                                ExpandedSCEVs, Checks, InstsToMove);
     ++LoopsEpilogueVectorized;
   } else {
     InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a0dc62020..009576adc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1598,8 +1598,8 @@ VPlan &LoopVectorizationPlanner::getPlanFor(ElementCount VF,
              1 &&
          "Multiple VPlans for VF.");
 
-  for (const VPlanPtr &Plan : 
-        UseEpilogueTailFoldedPlans ? EpilogueTailFoldedPlans : VPlans) {
+  for (const VPlanPtr &Plan :
+       UseEpilogueTailFoldedPlans ? EpilogueTailFoldedPlans : VPlans) {
     if (Plan->hasVF(VF))
       return *Plan.get();
   }

``````````

</details>


https://github.com/llvm/llvm-project/pull/181401