[llvm] [LV] Make ScalarEpilogue a LoopVectorizeHint (PR #160125)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 22 08:12:58 PDT 2025
https://github.com/artagnon created https://github.com/llvm/llvm-project/pull/160125
Move ScalarEpilogueLowering from LoopVectorizationCostModel into LoopVectorizeHints, as it is really a kind of hint that's dependent on the user's preferences. The patch has the side-effect of costing the scalar-epilogue correctly in some corner cases.
>From 8720c85b69a96d0f5c7f23cf7eb1687c84cc9c6f Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Mon, 22 Sep 2025 15:19:26 +0100
Subject: [PATCH] [LV] Make ScalarEpilogue a LoopVectorizeHint
Move ScalarEpilogueLowering from LoopVectorizationCostModel into
LoopVectorizeHints, as it is really a kind of hint that's dependent on
the user's preferences. The patch has the side-effect of costing the
scalar-epilogue correctly in some corner cases.
---
.../Vectorize/LoopVectorizationLegality.h | 50 +++-
.../Vectorize/LoopVectorizationLegality.cpp | 94 +++++++-
.../Transforms/Vectorize/LoopVectorize.cpp | 213 ++++--------------
.../LoopVectorize/RISCV/dead-ops-cost.ll | 4 +-
.../LoopVectorize/RISCV/strided-accesses.ll | 6 +-
5 files changed, 192 insertions(+), 175 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 405d4a742f37b..28bc90c6cf046 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -64,7 +64,8 @@ class LoopVectorizeHints {
HK_FORCE,
HK_ISVECTORIZED,
HK_PREDICATE,
- HK_SCALABLE
+ HK_SCALABLE,
+ HK_SCALAREPILOGUE
};
/// Hint - associates name and validation with the hint value.
@@ -97,6 +98,9 @@ class LoopVectorizeHints {
/// Says whether we should use fixed width or scalable vectorization.
Hint Scalable;
+ /// Hint specifying how we should lower the scalar epilogue.
+ Hint ScalarEpilogue;
+
/// Return the loop metadata prefix.
static StringRef Prefix() { return "llvm.loop."; }
@@ -121,6 +125,33 @@ class LoopVectorizeHints {
SK_PreferScalable = 1
};
+ /// Whether it is allowed to have the original scalar loop execute at least
+ /// once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+ /// iterations when the trip count is unknown or doesn't divide by the VF, or
+ /// as a peel-loop to handle gaps in interleave-groups. Under optsize and when
+ // the trip count is very small we don't allow any iterations to execute in
+ /// the scalar loop.
+ enum ScalarEpilogueKind {
+ // The default: allowing scalar epilogues.
+ SEK_Allowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
+ SEK_NotAllowedOptSize,
+
+ // A special case of vectorisation with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ SEK_NotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
+ SEK_NotNeededUsePredicate,
+
+ // Directive indicating we must either tail fold or not vectorize
+ SEK_NotAllowedUsePredicate
+ };
+
LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced,
OptimizationRemarkEmitter &ORE,
const TargetTransformInfo *TTI = nullptr);
@@ -156,6 +187,21 @@ class LoopVectorizeHints {
return FK_Disabled;
return (ForceKind)Force.Value;
}
+ ScalarEpilogueKind getScalarEpilogue() const {
+ return static_cast<ScalarEpilogueKind>(ScalarEpilogue.Value);
+ }
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogue.Value == SEK_Allowed;
+ }
+ void setScalarEpilogue(ScalarEpilogueKind SEK) { ScalarEpilogue.Value = SEK; }
+
+ // Determine how to lower the scalar epilogue and set it. Depends on 1)
+ // optimising for minimum code-size, 2) predicate compiler options, 3) loop
+ // hints forcing predication, and 4) a TTI hook that analyses whether the loop
+ // is suitable for predication.
+ void setScalarEpilogue(ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL,
+ InterleavedAccessInfo *IAI);
/// \return true if scalable vectorization has been explicitly disabled.
bool isScalableVectorizationDisabled() const {
@@ -196,6 +242,8 @@ class LoopVectorizeHints {
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter &ORE;
+
+ const TargetTransformInfo *TTI;
};
/// This holds vectorization requirements that must be verified late in
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ff35db14f7094..28ed2d6edf65e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -83,6 +83,34 @@ static cl::opt<bool> EnableHistogramVectorization(
"enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
cl::desc("Enables autovectorization of some loops containing histograms"));
+// Option prefer-predicate-over-epilogue indicates that an epilogue is
+// undesired, that predication is preferred, and this lists all options. I.e.,
+// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// and predicate the instructions accordingly. If tail-folding fails, there are
+// different fallback strategies depending on these values:
+enum class PreferPredicateTy {
+ ScalarEpilogue = 0,
+ PredicateElseScalarEpilogue,
+ PredicateOrDontVectorize
+};
+
+static cl::opt<PreferPredicateTy> PreferPredicateOverEpilogue(
+ "prefer-predicate-over-epilogue",
+ cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden,
+ cl::desc("Tail-folding and predication preferences over creating a scalar "
+ "epilogue loop."),
+ cl::values(
+ clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+ "Don't tail-predicate loops, create scalar epilogue"),
+ clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+ "predicate-else-scalar-epilogue",
+ "prefer tail-folding, create scalar epilogue if tail "
+ "folding fails."),
+ clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+ "predicate-dont-vectorize",
+ "prefers tail-folding, don't attempt vectorization if "
+ "tail-folding fails.")));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -100,6 +128,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
case HK_PREDICATE:
case HK_SCALABLE:
return (Val == 0 || Val == 1);
+ case HK_SCALAREPILOGUE:
+ return Val <= SEK_NotAllowedUsePredicate;
}
return false;
}
@@ -114,7 +144,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
- TheLoop(L), ORE(ORE) {
+ ScalarEpilogue("scalarepilogue", SEK_Allowed, HK_SCALAREPILOGUE),
+ TheLoop(L), ORE(ORE), TTI(TTI) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -302,8 +333,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
return;
unsigned Val = C->getZExtValue();
- Hint *Hints[] = {&Width, &Interleave, &Force,
- &IsVectorized, &Predicate, &Scalable};
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized,
+ &Predicate, &Scalable, &ScalarEpilogue};
for (auto *H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
@@ -315,6 +346,63 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
}
}
+void LoopVectorizeHints::setScalarEpilogue(ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI,
+ LoopVectorizationLegality &LVL,
+ InterleavedAccessInfo *IAI) {
+ // 1) OptSize takes precedence over all other options, i.e. if this is set,
+ // don't look at hints or options, and don't request a scalar epilogue.
+ // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
+ // LoopAccessInfo (due to code dependency and not being able to reliably get
+ // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
+ // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
+ // versioning when the vectorization is forced, unlike hasOptSize. So revert
+ // back to the old way and vectorize with versioning when forced. See D81345.)
+ Function *F = TheLoop->getHeader()->getParent();
+ if (F->hasOptSize() ||
+ (llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass) &&
+ getForce() != LoopVectorizeHints::FK_Enabled)) {
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedOptSize;
+ return;
+ }
+
+ // 2) If set, obey the directives
+ if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+ switch (PreferPredicateOverEpilogue) {
+ case PreferPredicateTy::ScalarEpilogue:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+ return;
+ case PreferPredicateTy::PredicateElseScalarEpilogue:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ case PreferPredicateTy::PredicateOrDontVectorize:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedUsePredicate;
+ return;
+ };
+ }
+
+ // 3) If set, obey the hints
+ switch (getPredicate()) {
+ case LoopVectorizeHints::FK_Enabled:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ case LoopVectorizeHints::FK_Disabled:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+ return;
+ };
+
+ // 4) if the TTI hook indicates this is profitable, request predication.
+ TailFoldingInfo TFI(TLI, &LVL, IAI);
+ if (TTI->preferPredicateOverEpilogue(&TFI)) {
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ }
+
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+}
+
// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca092dcfcb492..7ed66969dd489 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -197,37 +197,6 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
-// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
-// that predication is preferred, and this lists all options. I.e., the
-// vectorizer will try to fold the tail-loop (epilogue) into the vector body
-// and predicate the instructions accordingly. If tail-folding fails, there are
-// different fallback strategies depending on these values:
-namespace PreferPredicateTy {
- enum Option {
- ScalarEpilogue = 0,
- PredicateElseScalarEpilogue,
- PredicateOrDontVectorize
- };
-} // namespace PreferPredicateTy
-
-static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
- "prefer-predicate-over-epilogue",
- cl::init(PreferPredicateTy::ScalarEpilogue),
- cl::Hidden,
- cl::desc("Tail-folding and predication preferences over creating a scalar "
- "epilogue loop."),
- cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
- "scalar-epilogue",
- "Don't tail-predicate loops, create scalar epilogue"),
- clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
- "predicate-else-scalar-epilogue",
- "prefer tail-folding, create scalar epilogue if tail "
- "folding fails."),
- clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
- "predicate-dont-vectorize",
- "prefers tail-folding, don't attempt vectorization if "
- "tail-folding fails.")));
-
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"force-tail-folding-style", cl::desc("Force the tail folding style"),
cl::init(TailFoldingStyle::None),
@@ -854,30 +823,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
} // end namespace llvm
namespace llvm {
-
-// Loop vectorization cost-model hints how the scalar epilogue loop should be
-// lowered.
-enum ScalarEpilogueLowering {
-
- // The default: allowing scalar epilogues.
- CM_ScalarEpilogueAllowed,
-
- // Vectorization with OptForSize: don't allow epilogues.
- CM_ScalarEpilogueNotAllowedOptSize,
-
- // A special case of vectorisation with OptForSize: loops with a very small
- // trip count are considered for vectorization under OptForSize, thereby
- // making sure the cost of their loop body is dominant, free of runtime
- // guards and scalar iteration overheads.
- CM_ScalarEpilogueNotAllowedLowTripLoop,
-
- // Loop hint predicate indicating an epilogue is undesired.
- CM_ScalarEpilogueNotNeededUsePredicate,
-
- // Directive indicating we must either tail fold or not vectorize
- CM_ScalarEpilogueNotAllowedUsePredicate
-};
-
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
@@ -889,19 +834,17 @@ class LoopVectorizationCostModel {
friend class LoopVectorizationPlanner;
public:
- LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
- PredicatedScalarEvolution &PSE, LoopInfo *LI,
- LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints,
+ LoopVectorizeHints &Hints,
InterleavedAccessInfo &IAI,
ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
- : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
- TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {
+ : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+ AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
initializeVScaleForTuning();
CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
@@ -970,7 +913,7 @@ class LoopVectorizationCostModel {
/// the IsOrdered flag of RdxDesc is set and we do not allow reordering
/// of FP operations.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
- return !Hints->allowReordering() && RdxDesc.isOrdered();
+ return !Hints.allowReordering() && RdxDesc.isOrdered();
}
/// \returns The smallest bitwidth each instruction can be represented with.
@@ -1280,7 +1223,7 @@ class LoopVectorizationCostModel {
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
- if (!isScalarEpilogueAllowed()) {
+ if (!Hints.isScalarEpilogueAllowed()) {
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
}
@@ -1301,12 +1244,6 @@ class LoopVectorizationCostModel {
return false;
}
- /// Returns true if a scalar epilogue is not allowed due to optsize or a
- /// loop hint annotation.
- bool isScalarEpilogueAllowed() const {
- return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
- }
-
/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
if (!ChosenTailFoldingStyle)
@@ -1345,8 +1282,9 @@ class LoopVectorizationCostModel {
return;
// If for some reason EVL mode is unsupported, fallback to a scalar epilogue
// if it's allowed, or DataWithoutLaneMask otherwise.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
- ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ if (Hints.isScalarEpilogueAllowed() ||
+ Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotNeededUsePredicate)
ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
else
ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
@@ -1558,15 +1496,6 @@ class LoopVectorizationCostModel {
DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
PredicatedBBsAfterVectorization;
- /// Records whether it is allowed to have the original scalar loop execute at
- /// least once. This may be needed as a fallback loop in case runtime
- /// aliasing/dependence checks fail, or to handle the tail/remainder
- /// iterations when the trip count is unknown or doesn't divide by the VF,
- /// or as a peel-loop to handle gaps in interleave-groups.
- /// Under optsize and when the trip count is very small we don't allow any
- /// iterations to execute in the scalar loop.
- ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-
/// Control finally chosen tail folding style. The first element is used if
/// the IV update may overflow, the second element - if it does not.
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
@@ -1713,8 +1642,8 @@ class LoopVectorizationCostModel {
const Function *TheFunction;
- /// Loop Vectorize Hint.
- const LoopVectorizeHints *Hints;
+ /// Loop Vectorize Hints.
+ LoopVectorizeHints &Hints;
/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
@@ -2976,7 +2905,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
Legal->isMaskRequired(I);
bool LoadAccessWithGapsRequiresEpilogMasking =
isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
- !isScalarEpilogueAllowed();
+ !Hints.isScalarEpilogueAllowed();
bool StoreAccessWithGapsRequiresMasking =
isa<StoreInst>(I) && !Group->isFull();
if (!PredicatedAccessRequiresMasking &&
@@ -3313,7 +3242,7 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
return false;
- if (Hints->isScalableVectorizationDisabled()) {
+ if (Hints.isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
"ScalableVectorizationDisabled", ORE, TheLoop);
return false;
@@ -3538,21 +3467,21 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
- switch (ScalarEpilogueStatus) {
- case CM_ScalarEpilogueAllowed:
+ switch (Hints.getScalarEpilogue()) {
+ case LoopVectorizeHints::SEK_Allowed:
return computeFeasibleMaxVF(MaxTC, UserVF, false);
- case CM_ScalarEpilogueNotAllowedUsePredicate:
+ case LoopVectorizeHints::SEK_NotAllowedUsePredicate:
[[fallthrough]];
- case CM_ScalarEpilogueNotNeededUsePredicate:
+ case LoopVectorizeHints::SEK_NotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
<< "LV: Not allowing scalar epilogue, creating predicated "
<< "vector loop.\n");
break;
- case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ case LoopVectorizeHints::SEK_NotAllowedLowTripLoop:
// fallthrough as a special case of OptForSize
- case CM_ScalarEpilogueNotAllowedOptSize:
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ case LoopVectorizeHints::SEK_NotAllowedOptSize:
+ if (Hints.getScalarEpilogue() == LoopVectorizeHints::SEK_NotAllowedOptSize)
LLVM_DEBUG(
dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
else
@@ -3636,7 +3565,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
// factor in preference to allow the generation of a non-predicated loop.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
+ if (Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotAllowedLowTripLoop &&
NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
"remain for any chosen VF.\n");
@@ -3677,14 +3607,16 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+ if (Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotNeededUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
- ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+ Hints.setScalarEpilogue(LoopVectorizeHints::SEK_Allowed);
return MaxFactors;
}
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
+ if (Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotAllowedUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
return FixedScalableVFPair::getNone();
}
@@ -4344,7 +4276,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}
- if (!CM.isScalarEpilogueAllowed()) {
+ if (!Hints.isScalarEpilogueAllowed()) {
LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
"epilogue is allowed.\n");
return Result;
@@ -4538,7 +4470,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.
- if (!CM.isScalarEpilogueAllowed())
+ if (!Hints.isScalarEpilogueAllowed())
return 1;
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
@@ -5288,7 +5220,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
- (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
+ (Group->requiresScalarEpilogue() && !Hints.isScalarEpilogueAllowed()) ||
(isa<StoreInst>(I) && !Group->isFull());
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
@@ -8718,7 +8650,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// single VPInterleaveRecipe at its insertion point.
VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
InterleaveGroups, RecipeBuilder,
- CM.isScalarEpilogueAllowed());
+ Hints.isScalarEpilogueAllowed());
// Replace VPValues for known constant strides.
VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
@@ -9155,7 +9087,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
assert((!CM.OptForSize ||
- CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
+ Hints.getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
HasBranchWeights);
@@ -9169,7 +9101,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
if (CM.OptForSize) {
assert(
- CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
+ Hints.getForce() == LoopVectorizeHints::FK_Enabled &&
"Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.");
ORE->emit([&]() {
@@ -9227,55 +9159,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.set(this, DerivedIV, VPLane(0));
}
-// Determine how to lower the scalar epilogue, which depends on 1) optimising
-// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
-// predication, and 4) a TTI hook that analyses whether the loop is suitable
-// for predication.
-static ScalarEpilogueLowering getScalarEpilogueLowering(
- Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
- // 1) OptSize takes precedence over all other options, i.e. if this is set,
- // don't look at hints or options, and don't request a scalar epilogue.
- // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
- // LoopAccessInfo (due to code dependency and not being able to reliably get
- // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
- // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
- // versioning when the vectorization is forced, unlike hasOptSize. So revert
- // back to the old way and vectorize with versioning when forced. See D81345.)
- if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
- PGSOQueryType::IRPass) &&
- Hints.getForce() != LoopVectorizeHints::FK_Enabled))
- return CM_ScalarEpilogueNotAllowedOptSize;
-
- // 2) If set, obey the directives
- if (PreferPredicateOverEpilogue.getNumOccurrences()) {
- switch (PreferPredicateOverEpilogue) {
- case PreferPredicateTy::ScalarEpilogue:
- return CM_ScalarEpilogueAllowed;
- case PreferPredicateTy::PredicateElseScalarEpilogue:
- return CM_ScalarEpilogueNotNeededUsePredicate;
- case PreferPredicateTy::PredicateOrDontVectorize:
- return CM_ScalarEpilogueNotAllowedUsePredicate;
- };
- }
-
- // 3) If set, obey the hints
- switch (Hints.getPredicate()) {
- case LoopVectorizeHints::FK_Enabled:
- return CM_ScalarEpilogueNotNeededUsePredicate;
- case LoopVectorizeHints::FK_Disabled:
- return CM_ScalarEpilogueAllowed;
- };
-
- // 4) if the TTI hook indicates this is profitable, request predication.
- TailFoldingInfo TFI(TLI, &LVL, IAI);
- if (TTI->preferPredicateOverEpilogue(&TFI))
- return CM_ScalarEpilogueNotNeededUsePredicate;
-
- return CM_ScalarEpilogueAllowed;
-}
-
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -9296,11 +9179,9 @@ static bool processLoopInVPlanNativePath(
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- ScalarEpilogueLowering SEL =
- getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
-
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI, PSI, BFI);
+ Hints.setScalarEpilogue(PSI, BFI, TLI, *LVL, &IAI);
+ LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ Hints, IAI, PSI, BFI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
@@ -9421,7 +9302,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF, Loop *L,
PredicatedScalarEvolution &PSE,
VPCostContext &CostCtx, VPlan &Plan,
- ScalarEpilogueLowering SEL,
+ const LoopVectorizeHints &Hints,
std::optional<unsigned> VScale) {
InstructionCost TotalCost = Checks.getCost();
if (!TotalCost.isValid())
@@ -9498,7 +9379,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
// epilogue is allowed, choose the next closest multiple of VF. This should
// partly compensate for ignoring the epilogue cost.
uint64_t MinTC = std::max(MinTC1, MinTC2);
- if (SEL == CM_ScalarEpilogueAllowed)
+ if (Hints.isScalarEpilogueAllowed())
MinTC = alignTo(MinTC, IntVF);
VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
@@ -9926,8 +9807,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL =
- getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
+ Hints.setScalarEpilogue(PSI, BFI, TLI, LVL, &IAI);
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
@@ -9943,12 +9823,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << "\n");
// Predicate tail-folded loops are efficient even when the loop
// iteration count is low. However, setting the epilogue policy to
- // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
+ // `SEK_NotAllowedLowTripLoop` prevents vectorizing loops
// with runtime checks. It's more effective to let
// `isOutsideLoopWorkProfitable` determine if vectorization is
// beneficial for the loop.
- if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ if (Hints.getScalarEpilogue() !=
+ LoopVectorizeHints::SEK_NotNeededUsePredicate)
+ Hints.setScalarEpilogue(LoopVectorizeHints::SEK_NotAllowedLowTripLoop);
}
}
@@ -9999,8 +9880,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
// Use the cost model.
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
- F, &Hints, IAI, PSI, BFI);
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
+ Hints, IAI, PSI, BFI);
// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
@@ -10046,7 +9927,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.CostKind);
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
- LVP.getPlanFor(VF.Width), SEL,
+ LVP.getPlanFor(VF.Width), Hints,
CM.getVScaleForTuning())) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 65373e4128b7f..49cc0b7ad5def 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -83,7 +83,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 6)
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 8)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
@@ -168,7 +168,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 770, [[UMAX3]]
; CHECK-NEXT: [[SMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[SMAX4]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 19
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 24
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 4c84913eea23d..e7df49c9062e2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -706,7 +706,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-UF2-NEXT: [[P21:%.*]] = ptrtoint ptr [[P2:%.*]] to i64
; NOSTRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; NOSTRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; NOSTRIDED-UF2-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 12)
+; NOSTRIDED-UF2-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 16)
; NOSTRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]]
; NOSTRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; NOSTRIDED-UF2: vector.scevcheck:
@@ -862,7 +862,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-UF2-NEXT: entry:
; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; STRIDED-UF2-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 79)
+; STRIDED-UF2-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 80)
; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX9]]
; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; STRIDED-UF2: vector.scevcheck:
@@ -1224,7 +1224,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-UF2-NEXT: entry:
; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; STRIDED-UF2-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 28)
+; STRIDED-UF2-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 32)
; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX6]]
; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; STRIDED-UF2: vector.memcheck:
More information about the llvm-commits
mailing list