[llvm] [LV] Make ScalarEpilogue a LoopVectorizeHint (PR #160125)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 22 08:13:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-risc-v
Author: Ramkumar Ramachandra (artagnon)
<details>
<summary>Changes</summary>
Move ScalarEpilogueLowering from LoopVectorizationCostModel into LoopVectorizeHints, as it is really a kind of hint that's dependent on the user's preferences. The patch has the side-effect of costing the scalar-epilogue correctly in some corner cases.
---
Patch is 34.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160125.diff
5 Files Affected:
- (modified) llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (+49-1)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+91-3)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+47-166)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll (+3-3)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 405d4a742f37b..28bc90c6cf046 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -64,7 +64,8 @@ class LoopVectorizeHints {
HK_FORCE,
HK_ISVECTORIZED,
HK_PREDICATE,
- HK_SCALABLE
+ HK_SCALABLE,
+ HK_SCALAREPILOGUE
};
/// Hint - associates name and validation with the hint value.
@@ -97,6 +98,9 @@ class LoopVectorizeHints {
/// Says whether we should use fixed width or scalable vectorization.
Hint Scalable;
+ /// Hint specifying how we should lower the scalar epilogue.
+ Hint ScalarEpilogue;
+
/// Return the loop metadata prefix.
static StringRef Prefix() { return "llvm.loop."; }
@@ -121,6 +125,33 @@ class LoopVectorizeHints {
SK_PreferScalable = 1
};
+ /// Whether it is allowed to have the original scalar loop execute at least
+ /// once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+ /// iterations when the trip count is unknown or doesn't divide by the VF, or
+ /// as a peel-loop to handle gaps in interleave-groups. Under optsize and when
+ /// the trip count is very small we don't allow any iterations to execute in
+ /// the scalar loop.
+ enum ScalarEpilogueKind {
+ // The default: allowing scalar epilogues.
+ SEK_Allowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
+ SEK_NotAllowedOptSize,
+
+ // A special case of vectorisation with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ SEK_NotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
+ SEK_NotNeededUsePredicate,
+
+ // Directive indicating we must either tail fold or not vectorize.
+ SEK_NotAllowedUsePredicate
+ };
+
LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced,
OptimizationRemarkEmitter &ORE,
const TargetTransformInfo *TTI = nullptr);
@@ -156,6 +187,21 @@ class LoopVectorizeHints {
return FK_Disabled;
return (ForceKind)Force.Value;
}
+ ScalarEpilogueKind getScalarEpilogue() const {
+ return static_cast<ScalarEpilogueKind>(ScalarEpilogue.Value);
+ }
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogue.Value == SEK_Allowed;
+ }
+ void setScalarEpilogue(ScalarEpilogueKind SEK) { ScalarEpilogue.Value = SEK; }
+
+ // Determine how to lower the scalar epilogue and set it. Depends on 1)
+ // optimising for minimum code-size, 2) predicate compiler options, 3) loop
+ // hints forcing predication, and 4) a TTI hook that analyses whether the loop
+ // is suitable for predication.
+ void setScalarEpilogue(ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL,
+ InterleavedAccessInfo *IAI);
/// \return true if scalable vectorization has been explicitly disabled.
bool isScalableVectorizationDisabled() const {
@@ -196,6 +242,8 @@ class LoopVectorizeHints {
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter &ORE;
+
+ const TargetTransformInfo *TTI;
};
/// This holds vectorization requirements that must be verified late in
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ff35db14f7094..28ed2d6edf65e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -83,6 +83,34 @@ static cl::opt<bool> EnableHistogramVectorization(
"enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
cl::desc("Enables autovectorization of some loops containing histograms"));
+// Option prefer-predicate-over-epilogue indicates that an epilogue is
+// undesired, that predication is preferred, and this lists all options. I.e.,
+// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// and predicate the instructions accordingly. If tail-folding fails, there are
+// different fallback strategies depending on these values:
+enum class PreferPredicateTy {
+ ScalarEpilogue = 0,
+ PredicateElseScalarEpilogue,
+ PredicateOrDontVectorize
+};
+
+static cl::opt<PreferPredicateTy> PreferPredicateOverEpilogue(
+ "prefer-predicate-over-epilogue",
+ cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden,
+ cl::desc("Tail-folding and predication preferences over creating a scalar "
+ "epilogue loop."),
+ cl::values(
+ clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+ "Don't tail-predicate loops, create scalar epilogue"),
+ clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+ "predicate-else-scalar-epilogue",
+ "prefer tail-folding, create scalar epilogue if tail "
+ "folding fails."),
+ clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+ "predicate-dont-vectorize",
+ "prefers tail-folding, don't attempt vectorization if "
+ "tail-folding fails.")));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -100,6 +128,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
case HK_PREDICATE:
case HK_SCALABLE:
return (Val == 0 || Val == 1);
+ case HK_SCALAREPILOGUE:
+ return Val <= SEK_NotAllowedUsePredicate;
}
return false;
}
@@ -114,7 +144,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
- TheLoop(L), ORE(ORE) {
+ ScalarEpilogue("scalarepilogue", SEK_Allowed, HK_SCALAREPILOGUE),
+ TheLoop(L), ORE(ORE), TTI(TTI) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -302,8 +333,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
return;
unsigned Val = C->getZExtValue();
- Hint *Hints[] = {&Width, &Interleave, &Force,
- &IsVectorized, &Predicate, &Scalable};
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized,
+ &Predicate, &Scalable, &ScalarEpilogue};
for (auto *H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
@@ -315,6 +346,63 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
}
}
+void LoopVectorizeHints::setScalarEpilogue(ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI,
+ LoopVectorizationLegality &LVL,
+ InterleavedAccessInfo *IAI) {
+ // 1) OptSize takes precedence over all other options, i.e. if this is set,
+ // don't look at hints or options, and don't request a scalar epilogue.
+ // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
+ // LoopAccessInfo (due to code dependency and not being able to reliably get
+ // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
+ // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
+ // versioning when the vectorization is forced, unlike hasOptSize. So revert
+ // back to the old way and vectorize with versioning when forced. See D81345.)
+ Function *F = TheLoop->getHeader()->getParent();
+ if (F->hasOptSize() ||
+ (llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass) &&
+ getForce() != LoopVectorizeHints::FK_Enabled)) {
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedOptSize;
+ return;
+ }
+
+ // 2) If set, obey the directives
+ if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+ switch (PreferPredicateOverEpilogue) {
+ case PreferPredicateTy::ScalarEpilogue:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+ return;
+ case PreferPredicateTy::PredicateElseScalarEpilogue:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ case PreferPredicateTy::PredicateOrDontVectorize:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedUsePredicate;
+ return;
+ };
+ }
+
+ // 3) If set, obey the hints
+ switch (getPredicate()) {
+ case LoopVectorizeHints::FK_Enabled:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ case LoopVectorizeHints::FK_Disabled:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+ return;
+ };
+
+ // 4) If the TTI hook indicates this is profitable, request predication.
+ TailFoldingInfo TFI(TLI, &LVL, IAI);
+ if (TTI->preferPredicateOverEpilogue(&TFI)) {
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ }
+
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+}
+
// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca092dcfcb492..7ed66969dd489 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -197,37 +197,6 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
-// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
-// that predication is preferred, and this lists all options. I.e., the
-// vectorizer will try to fold the tail-loop (epilogue) into the vector body
-// and predicate the instructions accordingly. If tail-folding fails, there are
-// different fallback strategies depending on these values:
-namespace PreferPredicateTy {
- enum Option {
- ScalarEpilogue = 0,
- PredicateElseScalarEpilogue,
- PredicateOrDontVectorize
- };
-} // namespace PreferPredicateTy
-
-static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
- "prefer-predicate-over-epilogue",
- cl::init(PreferPredicateTy::ScalarEpilogue),
- cl::Hidden,
- cl::desc("Tail-folding and predication preferences over creating a scalar "
- "epilogue loop."),
- cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
- "scalar-epilogue",
- "Don't tail-predicate loops, create scalar epilogue"),
- clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
- "predicate-else-scalar-epilogue",
- "prefer tail-folding, create scalar epilogue if tail "
- "folding fails."),
- clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
- "predicate-dont-vectorize",
- "prefers tail-folding, don't attempt vectorization if "
- "tail-folding fails.")));
-
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"force-tail-folding-style", cl::desc("Force the tail folding style"),
cl::init(TailFoldingStyle::None),
@@ -854,30 +823,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
} // end namespace llvm
namespace llvm {
-
-// Loop vectorization cost-model hints how the scalar epilogue loop should be
-// lowered.
-enum ScalarEpilogueLowering {
-
- // The default: allowing scalar epilogues.
- CM_ScalarEpilogueAllowed,
-
- // Vectorization with OptForSize: don't allow epilogues.
- CM_ScalarEpilogueNotAllowedOptSize,
-
- // A special case of vectorisation with OptForSize: loops with a very small
- // trip count are considered for vectorization under OptForSize, thereby
- // making sure the cost of their loop body is dominant, free of runtime
- // guards and scalar iteration overheads.
- CM_ScalarEpilogueNotAllowedLowTripLoop,
-
- // Loop hint predicate indicating an epilogue is undesired.
- CM_ScalarEpilogueNotNeededUsePredicate,
-
- // Directive indicating we must either tail fold or not vectorize
- CM_ScalarEpilogueNotAllowedUsePredicate
-};
-
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
@@ -889,19 +834,17 @@ class LoopVectorizationCostModel {
friend class LoopVectorizationPlanner;
public:
- LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
- PredicatedScalarEvolution &PSE, LoopInfo *LI,
- LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints,
+ LoopVectorizeHints &Hints,
InterleavedAccessInfo &IAI,
ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
- : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
- TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {
+ : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+ AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
initializeVScaleForTuning();
CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
@@ -970,7 +913,7 @@ class LoopVectorizationCostModel {
/// the IsOrdered flag of RdxDesc is set and we do not allow reordering
/// of FP operations.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
- return !Hints->allowReordering() && RdxDesc.isOrdered();
+ return !Hints.allowReordering() && RdxDesc.isOrdered();
}
/// \returns The smallest bitwidth each instruction can be represented with.
@@ -1280,7 +1223,7 @@ class LoopVectorizationCostModel {
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
- if (!isScalarEpilogueAllowed()) {
+ if (!Hints.isScalarEpilogueAllowed()) {
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
}
@@ -1301,12 +1244,6 @@ class LoopVectorizationCostModel {
return false;
}
- /// Returns true if a scalar epilogue is not allowed due to optsize or a
- /// loop hint annotation.
- bool isScalarEpilogueAllowed() const {
- return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
- }
-
/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
if (!ChosenTailFoldingStyle)
@@ -1345,8 +1282,9 @@ class LoopVectorizationCostModel {
return;
// If for some reason EVL mode is unsupported, fallback to a scalar epilogue
// if it's allowed, or DataWithoutLaneMask otherwise.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
- ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ if (Hints.isScalarEpilogueAllowed() ||
+ Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotNeededUsePredicate)
ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
else
ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
@@ -1558,15 +1496,6 @@ class LoopVectorizationCostModel {
DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
PredicatedBBsAfterVectorization;
- /// Records whether it is allowed to have the original scalar loop execute at
- /// least once. This may be needed as a fallback loop in case runtime
- /// aliasing/dependence checks fail, or to handle the tail/remainder
- /// iterations when the trip count is unknown or doesn't divide by the VF,
- /// or as a peel-loop to handle gaps in interleave-groups.
- /// Under optsize and when the trip count is very small we don't allow any
- /// iterations to execute in the scalar loop.
- ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-
/// Control finally chosen tail folding style. The first element is used if
/// the IV update may overflow, the second element - if it does not.
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
@@ -1713,8 +1642,8 @@ class LoopVectorizationCostModel {
const Function *TheFunction;
- /// Loop Vectorize Hint.
- const LoopVectorizeHints *Hints;
+ /// Loop Vectorize Hints.
+ LoopVectorizeHints &Hints;
/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
@@ -2976,7 +2905,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
Legal->isMaskRequired(I);
bool LoadAccessWithGapsRequiresEpilogMasking =
isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
- !isScalarEpilogueAllowed();
+ !Hints.isScalarEpilogueAllowed();
bool StoreAccessWithGapsRequiresMasking =
isa<StoreInst>(I) && !Group->isFull();
if (!PredicatedAccessRequiresMasking &&
@@ -3313,7 +3242,7 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
return false;
- if (Hints->isScalableVectorizationDisabled()) {
+ if (Hints.isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
"ScalableVectorizationDisabled", ORE, TheLoop);
return false;
@@ -3538,21 +3467,21 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
- switch (ScalarEpilogueStatus) {
- case CM_ScalarEpilogueAllowed:
+ switch (Hints.getScalarEpilogue()) {
+ case LoopVectorizeHints::SEK_Allowed:
return computeFeasibleMaxVF(MaxTC, UserVF, false);
- case CM_ScalarEpilogueNotAllowedUsePredicate:
+ case LoopVectorizeHints::SEK_NotAllowedUsePredicate:
[[fallthrough]];
- case CM_ScalarEpilogueNotNeededUsePredicate:
+ case LoopVectorizeHints::SEK_NotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
<< "LV: Not allowing scalar epilogue, creating predicated "
<< "vector loop.\n");
break;
- case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ case LoopVectorizeHints::SEK_NotAllowedLowTripLoop:
// fallthrough as a special case of OptForSize
- case CM_ScalarEpilogueNotAllowedOptSize:
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ case LoopVectorizeHints::SEK_NotAllowedOptSize:
+ if (Hints.getScalarEpilogue() == LoopVectorizeHints::SEK_NotAllowedOptSize)
LLVM_DEBUG(
dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
else
@@ -3636,7 +3565,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/160125
More information about the llvm-commits
mailing list