[llvm-branch-commits] [llvm] Patch 3: [LV] Add extra CM instance for EpilogueTF (PR #202820)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jun 9 17:55:16 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-vectorizers
Author: Hassnaa Hamdi (hassnaaHamdi)
<details>
<summary>Changes</summary>
Builds on Patches 1 and 2 to introduce a second `LoopVectorizationCostModel` instance (`EpilogueTailFoldingCM`) dedicated to the tail-folded epilogue loop, created when `-epilogue-tail-folding-policy=prefer-fold-tail` is requested.
The planner's `plan()` validates this CM (runs `computeMaxVF`, checks `foldTailByMasking()`), adds it to an `EnabledCMs` list alongside the main CM, and calls `collectNonVectorizedAndSetWideningDecisions` on both CMs for each candidate VF.
If the epilogue tail-folding CM cannot fold the tail after planning, it is discarded.
`computeBestVF` is extended to compute costs using the epilogue CM for each VF candidate; the computed cost is currently unused, pending predicated VPlan support (see the TODOs in the diff).
Tests are added to cover the enabled and disabled epilogue tail-folding paths.
---
Full diff: https://github.com/llvm/llvm-project/pull/202820.diff
5 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (+5-3)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+86-28)
- (added) llvm/test/Transforms/LoopVectorize/AArch64/fold-epilogue-tail-costs.ll (+47)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll (+5-5)
- (modified) llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll (+23-5)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 8164bc5b620c5..b66267813aa20 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -850,7 +850,8 @@ class LoopVectorizationPlanner {
/// non-zero or all applicable candidate VFs otherwise. If vectorization and
/// interleaving should be avoided up-front, no plans are generated.
void plan(ElementCount UserVF, unsigned UserIC,
- LoopVectorizationCostModel &CM);
+ LoopVectorizationCostModel &CM,
+ std::optional<LoopVectorizationCostModel> &EpilogueTailFoldingCM);
/// Return the VPlan for \p VF. At the moment, there is always a single VPlan
/// for each VF.
@@ -859,8 +860,9 @@ class LoopVectorizationPlanner {
/// Compute and return the most profitable vectorization factor and the
/// corresponding best VPlan. Also collect all profitable VFs in
/// ProfitableVFs.
- std::pair<VectorizationFactor, VPlan *>
- computeBestVF(LoopVectorizationCostModel &CM);
+ std::pair<VectorizationFactor, VPlan *> computeBestVF(
+ LoopVectorizationCostModel &CM,
+ std::optional<LoopVectorizationCostModel> &EpilogueTailFoldingCM);
/// \return The desired interleave count.
/// If interleave count has been specified by metadata it will be returned.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e11d9054b6c8f..e37f806f2961c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5483,8 +5483,9 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
}
-void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
- LoopVectorizationCostModel &CM) {
+void LoopVectorizationPlanner::plan(
+ ElementCount UserVF, unsigned UserIC, LoopVectorizationCostModel &CM,
+ std::optional<LoopVectorizationCostModel> &EpilogueTailFoldingCM) {
CM.collectValuesToIgnore();
Config.collectElementTypesForWidening(&CM.ValuesToIgnore);
@@ -5517,22 +5518,49 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
// for later use by the cost model.
Config.computeMinimalBitwidths();
+ SmallVector<LoopVectorizationCostModel *, 2> EnabledCMs;
+ EnabledCMs.push_back(&CM);
+
+ // First make sure that an epilogue for the main vector loop is allowed,
+ // then check whether the tail-folded epilogue feature is enabled.
+ if (CM.EpilogueLoweringStatus == CM_EpilogueAllowed &&
+ EpilogueTailFoldingCM) {
+ // To avoid redundant heavy computation, copy the computed `ValuesToIgnore`
+ // and `VecValuesToIgnore` to the EpilogueTailFoldingCM, as they will be
+ // the same.
+ EpilogueTailFoldingCM->ValuesToIgnore.insert_range(CM.ValuesToIgnore);
+ EpilogueTailFoldingCM->VecValuesToIgnore.insert_range(CM.VecValuesToIgnore);
+
+ // After making sure that computeMaxVF returns a valid result, make sure
+ // that tail-folding for the epilogue loop is still valid.
+ if (EpilogueTailFoldingCM->computeMaxVF(UserVF, UserIC) &&
+ EpilogueTailFoldingCM->foldTailByMasking()) {
+ EnabledCMs.push_back(&*EpilogueTailFoldingCM);
+ LLVM_DEBUG(dbgs() << "LV: CM instances: " << EnabledCMs.size() << "\n");
+ }
+ }
+
// Invalidate interleave groups if all blocks of loop will be predicated.
- if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
- !useMaskedInterleavedAccesses(TTI)) {
- LLVM_DEBUG(
- dbgs()
- << "LV: Invalidate all interleaved groups due to fold-tail by masking "
- "which requires masked-interleaved support.\n");
- if (CM.InterleaveInfo.invalidateGroups())
- // Invalidating interleave groups also requires invalidating all decisions
- // based on them, which includes widening decisions and uniform and scalar
- // values.
- CM.invalidateCostModelingDecisions();
+ if (!useMaskedInterleavedAccesses(TTI)) {
+ for_each(EnabledCMs, [&](auto *CurrentCM) {
+ if (CurrentCM->blockNeedsPredicationForAnyReason(OrigLoop->getHeader())) {
+ LLVM_DEBUG(dbgs() << "LV: Invalidate all interleaved groups due to "
+ << "fold-tail by masking which requires "
+ "masked-interleaved support.\n");
+ if (CurrentCM->InterleaveInfo.invalidateGroups()) {
+ // Invalidating interleave groups also requires invalidating all
+ // decisions based on them, which includes widening decisions and
+ // uniform and scalar values.
+ CurrentCM->invalidateCostModelingDecisions();
+ }
+ }
+ });
}
- if (CM.foldTailByMasking())
- Legal->prepareToFoldTailByMasking();
+ for_each(EnabledCMs, [&](auto *CurrentCM) {
+ if (CurrentCM->foldTailByMasking())
+ Legal->prepareToFoldTailByMasking();
+ });
ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
@@ -5546,12 +5574,17 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
- CM.collectNonVectorizedAndSetWideningDecisions(UserVF);
+ for_each(EnabledCMs, [&](auto *CurrentCM) {
+ CurrentCM->collectNonVectorizedAndSetWideningDecisions(UserVF);
+ });
ElementCount EpilogueUserVF =
ElementCount::getFixed(EpilogueVectorizationForceVF);
if (EpilogueUserVF.isVector() &&
ElementCount::isKnownLT(EpilogueUserVF, UserVF)) {
- CM.collectNonVectorizedAndSetWideningDecisions(EpilogueUserVF);
+ for_each(EnabledCMs, [&](auto *CurrentCM) {
+ CurrentCM->collectNonVectorizedAndSetWideningDecisions(
+ EpilogueUserVF);
+ });
buildVPlans(*VPlan1, EpilogueUserVF, EpilogueUserVF, CM);
}
buildVPlans(*VPlan1, UserVF, UserVF, CM);
@@ -5582,7 +5615,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
for (const auto &VF : VFCandidates) {
// Collect Uniform and Scalar instructions after vectorization with VF.
- CM.collectNonVectorizedAndSetWideningDecisions(VF);
+ for_each(EnabledCMs, [&](auto *CurrentCM) {
+ CurrentCM->collectNonVectorizedAndSetWideningDecisions(VF);
+ });
}
buildVPlans(*VPlan1, ElementCount::getFixed(1), MaxFactors.FixedVF, CM);
@@ -5777,8 +5812,9 @@ LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
return Cost;
}
-std::pair<VectorizationFactor, VPlan *>
-LoopVectorizationPlanner::computeBestVF(LoopVectorizationCostModel &CM) {
+std::pair<VectorizationFactor, VPlan *> LoopVectorizationPlanner::computeBestVF(
+ LoopVectorizationCostModel &CM,
+ std::optional<LoopVectorizationCostModel> &EpilogueTailFoldingCM) {
if (VPlans.empty())
return {VectorizationFactor::Disabled(), nullptr};
// If there is a single VPlan with a single VF, return it directly.
@@ -5877,6 +5913,20 @@ LoopVectorizationPlanner::computeBestVF(LoopVectorizationCostModel &CM) {
// If profitable add it to ProfitableVF list.
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
ProfitableVFs.push_back(CurrentFactor);
+
+ // Get the costs for the EpilogueTailFoldingCM:
+ if (EpilogueTailFoldingCM) {
+ LLVM_DEBUG(dbgs() << "LV: Predicated CM, calculate costs for VF: " << VF
+ << "\n");
+ cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr,
+ *EpilogueTailFoldingCM);
+ // TODO: that cost is not accurate right now as it includes costs for
+ // unpredicated vplans instead of predicated ones. That should be fixed
+ // in future work.
+
+ // TODO: consider the VF as a profitable one when we support predicated
+ // vplans.
+ }
}
}
@@ -8012,16 +8062,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, Config, PSE, Hints,
ORE);
+ std::optional<InterleavedAccessInfo> TailFoldingCMIAI;
+ std::optional<LoopVectorizationCostModel> EpilogueTailFoldingCM;
EpilogueLowering EpilogueTailLoweringStatus =
getEpilogueTailLowering(CM, L, ORE);
if (EpilogueTailLoweringStatus ==
EpilogueLowering::CM_EpilogueNotNeededFoldTail) {
- // TODO: Apply tail-folding on the vectorized epilogue loop.
- LLVM_DEBUG(dbgs() << "LV: epilogue tail-folding is not supported yet\n");
- reportVectorizationInfo(
- "The epilogue-tail-folding policy prefer-fold-tail is not supported "
- "yet, fall back to a normal epilogue",
- "UnsupportedEpilogueTailFoldingPolicy", ORE, L);
+ LLVM_DEBUG(dbgs() << "LV: epilogue tail-folding is enabled\n");
+ TailFoldingCMIAI.emplace(PSE, L, DT, LI, LVL.getLAI());
+ if (UseInterleaved)
+ TailFoldingCMIAI->analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+ EpilogueTailFoldingCM.emplace(CM_EpilogueNotNeededFoldTail, L, PSE, LI,
+ &LVL, *TTI, TLI, AC, ORE, GetBFI, F, &Hints,
+ *TailFoldingCMIAI, Config);
}
// Get user vectorization factor and interleave count.
@@ -8035,8 +8088,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
UserIC = 1;
// Plan how to best vectorize.
- LVP.plan(UserVF, UserIC, CM);
- auto [VF, BestPlanPtr] = LVP.computeBestVF(CM);
+ LVP.plan(UserVF, UserIC, CM, EpilogueTailFoldingCM);
+ if (EpilogueTailFoldingCM && !EpilogueTailFoldingCM->foldTailByMasking()) {
+ // Tail-folding got disabled, no need for its CM instance.
+ LLVM_DEBUG(dbgs() << "LV: Epilogue Tail-folding got disabled.\n");
+ EpilogueTailFoldingCM.reset();
+ }
+ auto [VF, BestPlanPtr] = LVP.computeBestVF(CM, EpilogueTailFoldingCM);
unsigned IC = 1;
// For VPlan build stress testing of outer loops, bail after plan
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fold-epilogue-tail-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fold-epilogue-tail-costs.ll
new file mode 100644
index 0000000000000..c5d6e72f61c68
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fold-epilogue-tail-costs.ll
@@ -0,0 +1,47 @@
+; REQUIRES: asserts
+
+; RUN: opt -S -passes=loop-vectorize -epilogue-tail-folding-policy=dont-fold-tail \
+; RUN: -debug < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck --check-prefix=DEFAULT-CM %s
+
+; RUN: opt -S -passes=loop-vectorize -epilogue-tail-folding-policy=prefer-fold-tail \
+; RUN: -debug < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck --check-prefix=EPILOGUE-TF-CM %s
+
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; DEFAULT-CM: Cost for VF 2: 6
+; DEFAULT-CM: Cost for VF 4: 4
+; DEFAULT-CM: Cost for VF 8: 3
+; DEFAULT-CM: Cost for VF 16: 3
+
+; EPILOGUE-TF-CM: LV: epilogue tail-folding is enabled
+; EPILOGUE-TF-CM: LV: can fold tail by masking.
+; EPILOGUE-TF-CM: LV: CM instances: 2
+; EPILOGUE-TF-CM: LV: Predicated CM, calculate costs for VF: 2
+; EPILOGUE-TF-CM: Cost for VF 2: 6
+; EPILOGUE-TF-CM: LV: Predicated CM, calculate costs for VF: 4
+; EPILOGUE-TF-CM: Cost for VF 4: 11
+; EPILOGUE-TF-CM: LV: Predicated CM, calculate costs for VF: 8
+; EPILOGUE-TF-CM: Cost for VF 8: 21
+; EPILOGUE-TF-CM: LV: Predicated CM, calculate costs for VF: 16
+; EPILOGUE-TF-CM: Cost for VF 16: 41
+;
+define void @test_epilogue_tf(ptr %A, i64 %n) {
+; CHECK-LABEL: @test_epilogue_tf
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %A, i64 %iv
+ store i8 1, ptr %arrayidx, align 1
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ne i64 %iv.next, %n
+ br i1 %exitcond, label %for.body, label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index a8016c98d8fdd..3ddfd807310a9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -11,7 +11,7 @@ define i64 @test(ptr %a, ptr %b) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 8: 30
-; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 16: 56
; CHECK: LV: Selecting VF: 16
@@ -44,7 +44,7 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 8: 30
-; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
+; CHECK: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 16: 57
@@ -80,7 +80,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%index.next>, vp<{{.+}}>
; CHECK: Cost for VF 8: 17
-; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
+; CHECK: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK: Cost of 1 for VF 16: EXPRESSION vp<%11> = ir<%sum> + partial.reduce.add (mul nuw nsw (ir<%1> zext to i64), (ir<%0> zext to i64))
; CHECK: Cost for VF 16: 4
@@ -117,10 +117,10 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef
; CHECK: Cost of 1 for VF 8: EMIT vp<%cmp.n> = icmp eq ir<16>, vp<%2>
; CHECK-NEXT: Cost of 0 for VF 8: EMIT branch-on-cond vp<%cmp.n>
; CHECK: Cost for VF 8: 9
-; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK: Cost of 0 for VF 16: WIDEN ir<%exitcond.not> = icmp eq ir<%indvars.iv.next>, ir<16>
; CHECK: Cost of 1 for VF 16: EMIT vp<%cmp.n> = icmp eq ir<16>, vp<%2>
-; CHECK-NEXT: Cost of 0 for VF 16: EMIT branch-on-cond vp<%cmp.n>
+; CHECK: Cost of 0 for VF 16: EMIT branch-on-cond vp<%cmp.n>
; CHECK: Cost for VF 16: 4
; CHECK: LV: Selecting VF: 16
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll b/llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll
index 590907a5aff79..8f866390a9c26 100644
--- a/llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/fold-epilogue-tail.ll
@@ -2,14 +2,12 @@
; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize --disable-output \
; RUN: -epilogue-tail-folding-policy=prefer-fold-tail -pass-remarks-analysis=loop-vectorize 2>&1 | FileCheck %s
-; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -enable-epilogue-vectorization=false \
-; RUN: --disable-output -epilogue-tail-folding-policy=prefer-fold-tail -pass-remarks-analysis=loop-vectorize 2>&1 \
-; RUN: | FileCheck %s --check-prefix=CHECK-DISABLED-EPILOG
define void @test_epilogue_tf(ptr %A, i64 %n) {
; CHECK-LABEL: LV: Checking a loop in 'test_epilogue_tf'
-; CHECK: LV: epilogue tail-folding is not supported yet
-; CHECK: remark: <unknown>:0:0: The epilogue-tail-folding policy prefer-fold-tail is not supported yet, fall back to a normal epilogue
+; CHECK: LV: epilogue tail-folding is enabled
+; CHECK: LV: can fold tail by masking.
+; CHECK: LV: CM instances: 2
;
entry:
br label %for.body
@@ -26,6 +24,26 @@ exit:
ret void
}
+define void @tf-got-disabled(ptr %A, i64 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'tf-got-disabled'
+; CHECK: LV: epilogue tail-folding is enabled
+; CHECK: LV: Epilogue Tail-folding got disabled.
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep = getelementptr inbounds i32, ptr %A, i64 %iv
+ store i32 1, ptr %gep, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp ne i64 %iv.next, %n
+ br i1 %exitcond, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
define void @epilogue_is_disabled(ptr %a, i64 %n) {
; CHECK-DISABLED-EPILOG-LABEL: LV: Checking a loop in 'epilogue_is_disabled'
; CHECK-DISABLED-EPILOG: remark: <unknown>:0:0: Options conflict, epilogue vectorization is disallowed while epilogue tail-folding allowed!
``````````
</details>
https://github.com/llvm/llvm-project/pull/202820
More information about the llvm-branch-commits
mailing list