[llvm-branch-commits] [llvm] Patch 3: [LV] Add extra CM instance for EpilogueTF (PR #202820)

Tue Jun 9 17:55:16 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-vectorizers

Author: Hassnaa Hamdi (hassnaaHamdi)

<details>
<summary>Changes</summary>

Builds on Patches 1 and 2 to introduce a second `LoopVectorizationCostModel` instance (`EpilogueTailFoldingCM`) dedicated to the tail-folded epilogue loop, created when `-epilogue-tail-folding-policy=prefer-fold-tail` is requested.  
The planner's `plan()` validates this CM (runs `computeMaxVF`, checks `foldTailByMasking()`), adds it to an `EnabledCMs` list alongside the main CM, and calls `collectNonVectorizedAndSetWideningDecisions` on both CMs for each candidate VF.  
If the epilogue tail-folding CM cannot fold the tail after planning, it is discarded.  
`computeBestVF` is extended to compute costs using the epilogue CM for each VF candidate (these costs are currently unused, pending predicated VPlan support — see the TODOs in the diff).  
Tests are added to cover the enabled and disabled epilogue tail-folding paths.

---
Full diff: https://github.com/llvm/llvm-project/pull/202820.diff


5 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (+5-3) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+86-28) 
- (added) llvm/test/Transforms/LoopVectorize/AArch64/fold-epilogue-tail-costs.ll (+47) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll (+5-5) 
- (modified) llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll (+23-5) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 8164bc5b620c5..b66267813aa20 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -850,7 +850,8 @@ class LoopVectorizationPlanner {
   /// non-zero or all applicable candidate VFs otherwise. If vectorization and
   /// interleaving should be avoided up-front, no plans are generated.
   void plan(ElementCount UserVF, unsigned UserIC,
-            LoopVectorizationCostModel &CM);
+            LoopVectorizationCostModel &CM,
+            std::optional<LoopVectorizationCostModel> &EpilogueTailFoldingCM);
 
   /// Return the VPlan for \p VF. At the moment, there is always a single VPlan
   /// for each VF.
@@ -859,8 +860,9 @@ class LoopVectorizationPlanner {
   /// Compute and return the most profitable vectorization factor and the
   /// corresponding best VPlan. Also collect all profitable VFs in
   /// ProfitableVFs.
-  std::pair<VectorizationFactor, VPlan *>
-  computeBestVF(LoopVectorizationCostModel &CM);
+  std::pair<VectorizationFactor, VPlan *> computeBestVF(
+      LoopVectorizationCostModel &CM,
+      std::optional<LoopVectorizationCostModel> &EpilogueTailFoldingCM);
 
   /// \return The desired interleave count.
   /// If interleave count has been specified by metadata it will be returned.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e11d9054b6c8f..e37f806f2961c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5483,8 +5483,9 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
   }
 }
 
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
-                                    LoopVectorizationCostModel &CM) {
+void LoopVectorizationPlanner::plan(
+    ElementCount UserVF, unsigned UserIC, LoopVectorizationCostModel &CM,
+    std::optional<LoopVectorizationCostModel> &EpilogueTailFoldingCM) {
   CM.collectValuesToIgnore();
   Config.collectElementTypesForWidening(&CM.ValuesToIgnore);
 
@@ -5517,22 +5518,49 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
   // for later use by the cost model.
   Config.computeMinimalBitwidths();
 
+  SmallVector<LoopVectorizationCostModel *, 2> EnabledCMs;
+  EnabledCMs.push_back(&CM);
+
+  // First make sure that an epilogue for the main vector loop is allowed, then
+  // check whether the tail-folded epilogue feature is enabled.
+  if (CM.EpilogueLoweringStatus == CM_EpilogueAllowed &&
+      EpilogueTailFoldingCM) {
+    // To avoid redundant heavy computation, copy the already-computed
+    // `ValuesToIgnore` and `VecValuesToIgnore` to the EpilogueTailFoldingCM,
+    // as they will be the same.
+    EpilogueTailFoldingCM->ValuesToIgnore.insert_range(CM.ValuesToIgnore);
+    EpilogueTailFoldingCM->VecValuesToIgnore.insert_range(CM.VecValuesToIgnore);
+
+    // After making sure that computeMaxVF returns valid results, make sure
+    // that tail-folding is still valid for the epilogue loop.
+    if (EpilogueTailFoldingCM->computeMaxVF(UserVF, UserIC) &&
+        EpilogueTailFoldingCM->foldTailByMasking()) {
+      EnabledCMs.push_back(&*EpilogueTailFoldingCM);
+      LLVM_DEBUG(dbgs() << "LV: CM instances: " << EnabledCMs.size() << "\n");
+    }
+  }
+
   // Invalidate interleave groups if all blocks of loop will be predicated.
-  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
-      !useMaskedInterleavedAccesses(TTI)) {
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
-           "which requires masked-interleaved support.\n");
-    if (CM.InterleaveInfo.invalidateGroups())
-      // Invalidating interleave groups also requires invalidating all decisions
-      // based on them, which includes widening decisions and uniform and scalar
-      // values.
-      CM.invalidateCostModelingDecisions();
+  if (!useMaskedInterleavedAccesses(TTI)) {
+    for_each(EnabledCMs, [&](auto *CurrentCM) {
+      if (CurrentCM->blockNeedsPredicationForAnyReason(OrigLoop->getHeader())) {
+        LLVM_DEBUG(dbgs() << "LV: Invalidate all interleaved groups due to "
+                          << "fold-tail by masking which requires "
+                             "masked-interleaved support.\n");
+        if (CurrentCM->InterleaveInfo.invalidateGroups()) {
+          // Invalidating interleave groups also requires invalidating all
+          // decisions based on them, which includes widening decisions and
+          // uniform and scalar values.
+          CurrentCM->invalidateCostModelingDecisions();
+        }
+      }
+    });
   }
 
-  if (CM.foldTailByMasking())
-    Legal->prepareToFoldTailByMasking();
+  for_each(EnabledCMs, [&](auto *CurrentCM) {
+    if (CurrentCM->foldTailByMasking())
+      Legal->prepareToFoldTailByMasking();
+  });
 
   ElementCount MaxUserVF =
       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
@@ -5546,12 +5574,17 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
              "VF needs to be a power of two");
       // Collect the instructions (and their associated costs) that will be more
       // profitable to scalarize.
-      CM.collectNonVectorizedAndSetWideningDecisions(UserVF);
+      for_each(EnabledCMs, [&](auto *CurrentCM) {
+        CurrentCM->collectNonVectorizedAndSetWideningDecisions(UserVF);
+      });
       ElementCount EpilogueUserVF =
           ElementCount::getFixed(EpilogueVectorizationForceVF);
       if (EpilogueUserVF.isVector() &&
           ElementCount::isKnownLT(EpilogueUserVF, UserVF)) {
-        CM.collectNonVectorizedAndSetWideningDecisions(EpilogueUserVF);
+        for_each(EnabledCMs, [&](auto *CurrentCM) {
+          CurrentCM->collectNonVectorizedAndSetWideningDecisions(
+              EpilogueUserVF);
+        });
         buildVPlans(*VPlan1, EpilogueUserVF, EpilogueUserVF, CM);
       }
       buildVPlans(*VPlan1, UserVF, UserVF, CM);
@@ -5582,7 +5615,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
 
   for (const auto &VF : VFCandidates) {
     // Collect Uniform and Scalar instructions after vectorization with VF.
-    CM.collectNonVectorizedAndSetWideningDecisions(VF);
+    for_each(EnabledCMs, [&](auto *CurrentCM) {
+      CurrentCM->collectNonVectorizedAndSetWideningDecisions(VF);
+    });
   }
 
   buildVPlans(*VPlan1, ElementCount::getFixed(1), MaxFactors.FixedVF, CM);
@@ -5777,8 +5812,9 @@ LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
   return Cost;
 }
 
-std::pair<VectorizationFactor, VPlan *>
-LoopVectorizationPlanner::computeBestVF(LoopVectorizationCostModel &CM) {
+std::pair<VectorizationFactor, VPlan *> LoopVectorizationPlanner::computeBestVF(
+    LoopVectorizationCostModel &CM,
+    std::optional<LoopVectorizationCostModel> &EpilogueTailFoldingCM) {
   if (VPlans.empty())
     return {VectorizationFactor::Disabled(), nullptr};
   // If there is a single VPlan with a single VF, return it directly.
@@ -5877,6 +5913,20 @@ LoopVectorizationPlanner::computeBestVF(LoopVectorizationCostModel &CM) {
       // If profitable add it to ProfitableVF list.
       if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
         ProfitableVFs.push_back(CurrentFactor);
+
+      // Get the costs for the EpilogueTailFoldingCM:
+      if (EpilogueTailFoldingCM) {
+        LLVM_DEBUG(dbgs() << "LV: Predicated CM, calculate costs for VF: " << VF
+                          << "\n");
+        cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr,
+             *EpilogueTailFoldingCM);
+        // TODO: that cost is not accurate right now as it includes costs for
+        // unpredicated vplans instead of predicated ones. That should be fixed
+        // in future work.
+
+        // TODO: consider the VF as a profitable one when we support predicated
+        // vplans.
+      }
     }
   }
 
@@ -8012,16 +8062,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, Config, PSE, Hints,
                                ORE);
 
+  std::optional<InterleavedAccessInfo> TailFoldingCMIAI;
+  std::optional<LoopVectorizationCostModel> EpilogueTailFoldingCM;
   EpilogueLowering EpilogueTailLoweringStatus =
       getEpilogueTailLowering(CM, L, ORE);
   if (EpilogueTailLoweringStatus ==
       EpilogueLowering::CM_EpilogueNotNeededFoldTail) {
-    // TODO: Apply tail-folding on the vectorized epilogue loop.
-    LLVM_DEBUG(dbgs() << "LV: epilogue tail-folding is not supported yet\n");
-    reportVectorizationInfo(
-        "The epilogue-tail-folding policy prefer-fold-tail is not supported "
-        "yet, fall back to a normal epilogue",
-        "UnsupportedEpilogueTailFoldingPolicy", ORE, L);
+    LLVM_DEBUG(dbgs() << "LV: epilogue tail-folding is enabled\n");
+    TailFoldingCMIAI.emplace(PSE, L, DT, LI, LVL.getLAI());
+    if (UseInterleaved)
+      TailFoldingCMIAI->analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+    EpilogueTailFoldingCM.emplace(CM_EpilogueNotNeededFoldTail, L, PSE, LI,
+                                  &LVL, *TTI, TLI, AC, ORE, GetBFI, F, &Hints,
+                                  *TailFoldingCMIAI, Config);
   }
 
   // Get user vectorization factor and interleave count.
@@ -8035,8 +8088,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     UserIC = 1;
 
   // Plan how to best vectorize.
-  LVP.plan(UserVF, UserIC, CM);
-  auto [VF, BestPlanPtr] = LVP.computeBestVF(CM);
+  LVP.plan(UserVF, UserIC, CM, EpilogueTailFoldingCM);
+  if (EpilogueTailFoldingCM && !EpilogueTailFoldingCM->foldTailByMasking()) {
+    // Tail-folding got disabled, no need for its CM instance.
+    LLVM_DEBUG(dbgs() << "LV: Epilogue Tail-folding got disabled.\n");
+    EpilogueTailFoldingCM.reset();
+  }
+  auto [VF, BestPlanPtr] = LVP.computeBestVF(CM, EpilogueTailFoldingCM);
   unsigned IC = 1;
 
   // For VPlan build stress testing of outer loops, bail after plan
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fold-epilogue-tail-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fold-epilogue-tail-costs.ll
new file mode 100644
index 0000000000000..c5d6e72f61c68
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fold-epilogue-tail-costs.ll
@@ -0,0 +1,47 @@
+; REQUIRES: asserts
+
+; RUN: opt -S -passes=loop-vectorize -epilogue-tail-folding-policy=dont-fold-tail \
+; RUN:   -debug < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck --check-prefix=DEFAULT-CM %s
+
+; RUN: opt -S -passes=loop-vectorize -epilogue-tail-folding-policy=prefer-fold-tail \
+; RUN:   -debug < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck --check-prefix=EPILOGUE-TF-CM %s
+
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; DEFAULT-CM: Cost for VF 2: 6
+; DEFAULT-CM: Cost for VF 4: 4
+; DEFAULT-CM: Cost for VF 8: 3
+; DEFAULT-CM: Cost for VF 16: 3
+
+; EPILOGUE-TF-CM: LV: epilogue tail-folding is enabled
+; EPILOGUE-TF-CM: LV: can fold tail by masking.
+; EPILOGUE-TF-CM: LV: CM instances: 2
+; EPILOGUE-TF-CM: LV: Predicated CM, calculate costs for VF: 2
+; EPILOGUE-TF-CM: Cost for VF 2: 6
+; EPILOGUE-TF-CM: LV: Predicated CM, calculate costs for VF: 4
+; EPILOGUE-TF-CM: Cost for VF 4: 11
+; EPILOGUE-TF-CM: LV: Predicated CM, calculate costs for VF: 8
+; EPILOGUE-TF-CM: Cost for VF 8: 21
+; EPILOGUE-TF-CM: LV: Predicated CM, calculate costs for VF: 16
+; EPILOGUE-TF-CM: Cost for VF 16: 41
+;
+define void @test_epilogue_tf(ptr %A, i64 %n) {
+; CHECK-LABEL: @test_epilogue_tf
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %A, i64 %iv
+  store i8 1, ptr %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, %n
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index a8016c98d8fdd..3ddfd807310a9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -11,7 +11,7 @@ define i64 @test(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
 ; CHECK: Cost for VF 8: 30
-; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
 ; CHECK: Cost for VF 16: 56
 ; CHECK: LV: Selecting VF: 16
@@ -44,7 +44,7 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
 ; CHECK: Cost for VF 8: 30
-; CHECK-NEXT: Cost of 1 for VF 16: induction instruction   %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK: Cost of 1 for VF 16: induction instruction   %i.iv.next = add nuw nsw i64 %i.iv, 1
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
 ; CHECK: Cost for VF 16: 57
@@ -80,7 +80,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
 ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction   %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
 ; CHECK: Cost for VF 8: 17
-; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
 ; CHECK: Cost of 1 for VF 16: EXPRESSION vp<%11> = ir<%sum> + partial.reduce.add (mul nuw nsw (ir<%1> zext to i64), (ir<%0> zext to i64))
 ; CHECK: Cost for VF 16: 4
@@ -117,10 +117,10 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef
 ; CHECK: Cost of 1 for VF 8: EMIT vp<%cmp.n> = icmp eq ir<16>, vp<%2>
 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT branch-on-cond vp<%cmp.n>
 ; CHECK: Cost for VF 8: 9
-; CHECK-NEXT: Cost of 0 for VF 16: induction instruction   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK: Cost of 0 for VF 16: induction instruction   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 ; CHECK: Cost of 0 for VF 16: WIDEN ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<16>
 ; CHECK: Cost of 1 for VF 16: EMIT vp<%cmp.n> = icmp eq ir<16>, vp<%2>
-; CHECK-NEXT: Cost of 0 for VF 16: EMIT branch-on-cond vp<%cmp.n>
+; CHECK: Cost of 0 for VF 16: EMIT branch-on-cond vp<%cmp.n>
 ; CHECK: Cost for VF 16: 4
 ; CHECK: LV: Selecting VF: 16
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll b/llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll
index 590907a5aff79..8f866390a9c26 100644
--- a/llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll
@@ -2,14 +2,12 @@
 ; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize --disable-output \
 ; RUN: -epilogue-tail-folding-policy=prefer-fold-tail -pass-remarks-analysis=loop-vectorize 2>&1 | FileCheck %s
 
-; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -enable-epilogue-vectorization=false \
-; RUN: --disable-output -epilogue-tail-folding-policy=prefer-fold-tail -pass-remarks-analysis=loop-vectorize 2>&1 \
-; RUN: | FileCheck %s --check-prefix=CHECK-DISABLED-EPILOG
 
 define void @test_epilogue_tf(ptr %A, i64 %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'test_epilogue_tf'
-; CHECK: LV: epilogue tail-folding is not supported yet
-; CHECK: remark: <unknown>:0:0: The epilogue-tail-folding policy prefer-fold-tail is not supported yet, fall back to a normal epilogue
+; CHECK: LV: epilogue tail-folding is enabled
+; CHECK: LV: can fold tail by masking.
+; CHECK: LV: CM instances: 2
 ;
 entry:
   br label %for.body
@@ -26,6 +24,26 @@ exit:
   ret void
 }
 
+define void @tf-got-disabled(ptr %A, i64 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'tf-got-disabled'
+; CHECK: LV: epilogue tail-folding is enabled
+; CHECK: LV: Epilogue Tail-folding got disabled.
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+  store i32 1, ptr %gep, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, %n
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
 define void @epilogue_is_disabled(ptr %a, i64 %n) {
 ; CHECK-DISABLED-EPILOG-LABEL: LV: Checking a loop in 'epilogue_is_disabled'
 ; CHECK-DISABLED-EPILOG: remark: <unknown>:0:0: Options conflict, epilogue vectorization is disallowed while epilogue tail-folding allowed!

``````````

</details>


https://github.com/llvm/llvm-project/pull/202820