[llvm] [LV] Make ScalarEpilogue a LoopVectorizeHint (PR #160125)

Mon Sep 22 08:13:30 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-risc-v

Author: Ramkumar Ramachandra (artagnon)

<details>
<summary>Changes</summary>

Move ScalarEpilogueLowering from LoopVectorizationCostModel into LoopVectorizeHints, as it is really a kind of hint that's dependent on the user's preferences. The patch has the side-effect of costing the scalar-epilogue correctly in some corner cases.

---

Patch is 34.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160125.diff


5 Files Affected:

- (modified) llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (+49-1) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+91-3) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+47-166) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll (+3-3) 


``````````diff

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 405d4a742f37b..28bc90c6cf046 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -64,7 +64,8 @@ class LoopVectorizeHints {
     HK_FORCE,
     HK_ISVECTORIZED,
     HK_PREDICATE,
-    HK_SCALABLE
+    HK_SCALABLE,
+    HK_SCALAREPILOGUE
   };
 
   /// Hint - associates name and validation with the hint value.
@@ -97,6 +98,9 @@ class LoopVectorizeHints {
   /// Says whether we should use fixed width or scalable vectorization.
   Hint Scalable;
 
+  /// Hint specifying how we should lower the scalar epilogue.
+  Hint ScalarEpilogue;
+
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }
 
@@ -121,6 +125,33 @@ class LoopVectorizeHints {
     SK_PreferScalable = 1
   };
 
+  /// Whether it is allowed to have the original scalar loop execute at least
+  /// once. This may be needed as a fallback loop in case runtime
+  /// aliasing/dependence checks fail, or to handle the tail/remainder
+  /// iterations when the trip count is unknown or doesn't divide by the VF, or
+  /// as a peel-loop to handle gaps in interleave-groups. Under optsize and when
+  /// the trip count is very small we don't allow anyiterations to execute in
+  /// the scalar loop.
+  enum ScalarEpilogueKind {
+    // The default: allowing scalar epilogues.
+    SEK_Allowed,
+
+    // Vectorization with OptForSize: don't allow epilogues.
+    SEK_NotAllowedOptSize,
+
+    // A special case of vectorisation with OptForSize: loops with a very small
+    // trip count are considered for vectorization under OptForSize, thereby
+    // making sure the cost of their loop body is dominant, free of runtime
+    // guards and scalar iteration overheads.
+    SEK_NotAllowedLowTripLoop,
+
+    // Loop hint predicate indicating an epilogue is undesired.
+    SEK_NotNeededUsePredicate,
+
+    // Directive indicating we must either tail fold or not vectorize
+    SEK_NotAllowedUsePredicate
+  };
+
   LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced,
                      OptimizationRemarkEmitter &ORE,
                      const TargetTransformInfo *TTI = nullptr);
@@ -156,6 +187,21 @@ class LoopVectorizeHints {
       return FK_Disabled;
     return (ForceKind)Force.Value;
   }
+  ScalarEpilogueKind getScalarEpilogue() const {
+    return static_cast<ScalarEpilogueKind>(ScalarEpilogue.Value);
+  }
+  bool isScalarEpilogueAllowed() const {
+    return ScalarEpilogue.Value == SEK_Allowed;
+  }
+  void setScalarEpilogue(ScalarEpilogueKind SEK) { ScalarEpilogue.Value = SEK; }
+
+  // Determine how to lower the scalar epilogue and set it. Depends on 1)
+  // optimising for minimum code-size, 2) predicate compiler options, 3) loop
+  // hints forcing predication, and 4) a TTI hook that analyses whether the loop
+  // is suitable for predication.
+  void setScalarEpilogue(ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                         TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL,
+                         InterleavedAccessInfo *IAI);
 
   /// \return true if scalable vectorization has been explicitly disabled.
   bool isScalableVectorizationDisabled() const {
@@ -196,6 +242,8 @@ class LoopVectorizeHints {
 
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter &ORE;
+
+  const TargetTransformInfo *TTI;
 };
 
 /// This holds vectorization requirements that must be verified late in
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ff35db14f7094..28ed2d6edf65e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -83,6 +83,34 @@ static cl::opt<bool> EnableHistogramVectorization(
     "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
     cl::desc("Enables autovectorization of some loops containing histograms"));
 
+// Option prefer-predicate-over-epilogue indicates that an epilogue is
+// undesired, that predication is preferred, and this lists all options. I.e.,
+// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// and predicate the instructions accordingly. If tail-folding fails, there are
+// different fallback strategies depending on these values:
+enum class PreferPredicateTy {
+  ScalarEpilogue = 0,
+  PredicateElseScalarEpilogue,
+  PredicateOrDontVectorize
+};
+
+static cl::opt<PreferPredicateTy> PreferPredicateOverEpilogue(
+    "prefer-predicate-over-epilogue",
+    cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden,
+    cl::desc("Tail-folding and predication preferences over creating a scalar "
+             "epilogue loop."),
+    cl::values(
+        clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+                   "Don't tail-predicate loops, create scalar epilogue"),
+        clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+                   "predicate-else-scalar-epilogue",
+                   "prefer tail-folding, create scalar epilogue if tail "
+                   "folding fails."),
+        clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+                   "predicate-dont-vectorize",
+                   "prefers tail-folding, don't attempt vectorization if "
+                   "tail-folding fails.")));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -100,6 +128,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
   case HK_PREDICATE:
   case HK_SCALABLE:
     return (Val == 0 || Val == 1);
+  case HK_SCALAREPILOGUE:
+    return Val <= SEK_NotAllowedUsePredicate;
   }
   return false;
 }
@@ -114,7 +144,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
       IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
       Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
       Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
-      TheLoop(L), ORE(ORE) {
+      ScalarEpilogue("scalarepilogue", SEK_Allowed, HK_SCALAREPILOGUE),
+      TheLoop(L), ORE(ORE), TTI(TTI) {
   // Populate values with existing loop metadata.
   getHintsFromMetadata();
 
@@ -302,8 +333,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
     return;
   unsigned Val = C->getZExtValue();
 
-  Hint *Hints[] = {&Width,        &Interleave, &Force,
-                   &IsVectorized, &Predicate,  &Scalable};
+  Hint *Hints[] = {&Width,     &Interleave, &Force,         &IsVectorized,
+                   &Predicate, &Scalable,   &ScalarEpilogue};
   for (auto *H : Hints) {
     if (Name == H->Name) {
       if (H->validate(Val))
@@ -315,6 +346,63 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
   }
 }
 
+void LoopVectorizeHints::setScalarEpilogue(ProfileSummaryInfo *PSI,
+                                           BlockFrequencyInfo *BFI,
+                                           TargetLibraryInfo *TLI,
+                                           LoopVectorizationLegality &LVL,
+                                           InterleavedAccessInfo *IAI) {
+  // 1) OptSize takes precedence over all other options, i.e. if this is set,
+  // don't look at hints or options, and don't request a scalar epilogue.
+  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
+  // LoopAccessInfo (due to code dependency and not being able to reliably get
+  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
+  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
+  // versioning when the vectorization is forced, unlike hasOptSize. So revert
+  // back to the old way and vectorize with versioning when forced. See D81345.)
+  Function *F = TheLoop->getHeader()->getParent();
+  if (F->hasOptSize() ||
+      (llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+                                   PGSOQueryType::IRPass) &&
+       getForce() != LoopVectorizeHints::FK_Enabled)) {
+    ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedOptSize;
+    return;
+  }
+
+  // 2) If set, obey the directives
+  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+    switch (PreferPredicateOverEpilogue) {
+    case PreferPredicateTy::ScalarEpilogue:
+      ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+      return;
+    case PreferPredicateTy::PredicateElseScalarEpilogue:
+      ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+      return;
+    case PreferPredicateTy::PredicateOrDontVectorize:
+      ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedUsePredicate;
+      return;
+    };
+  }
+
+  // 3) If set, obey the hints
+  switch (getPredicate()) {
+  case LoopVectorizeHints::FK_Enabled:
+    ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+    return;
+  case LoopVectorizeHints::FK_Disabled:
+    ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+    return;
+  };
+
+  // 4) if the TTI hook indicates this is profitable, request predication.
+  TailFoldingInfo TFI(TLI, &LVL, IAI);
+  if (TTI->preferPredicateOverEpilogue(&TFI)) {
+    ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+    return;
+  }
+
+  ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+}
+
 // Return true if the inner loop \p Lp is uniform with regard to the outer loop
 // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
 // executing the inner loop will execute the same iterations). This check is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca092dcfcb492..7ed66969dd489 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -197,37 +197,6 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
     cl::desc("The maximum allowed number of runtime memory checks"));
 
-// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
-// that predication is preferred, and this lists all options. I.e., the
-// vectorizer will try to fold the tail-loop (epilogue) into the vector body
-// and predicate the instructions accordingly. If tail-folding fails, there are
-// different fallback strategies depending on these values:
-namespace PreferPredicateTy {
-  enum Option {
-    ScalarEpilogue = 0,
-    PredicateElseScalarEpilogue,
-    PredicateOrDontVectorize
-  };
-} // namespace PreferPredicateTy
-
-static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
-    "prefer-predicate-over-epilogue",
-    cl::init(PreferPredicateTy::ScalarEpilogue),
-    cl::Hidden,
-    cl::desc("Tail-folding and predication preferences over creating a scalar "
-             "epilogue loop."),
-    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
-                         "scalar-epilogue",
-                         "Don't tail-predicate loops, create scalar epilogue"),
-              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
-                         "predicate-else-scalar-epilogue",
-                         "prefer tail-folding, create scalar epilogue if tail "
-                         "folding fails."),
-              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
-                         "predicate-dont-vectorize",
-                         "prefers tail-folding, don't attempt vectorization if "
-                         "tail-folding fails.")));
-
 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
     "force-tail-folding-style", cl::desc("Force the tail folding style"),
     cl::init(TailFoldingStyle::None),
@@ -854,30 +823,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
 } // end namespace llvm
 
 namespace llvm {
-
-// Loop vectorization cost-model hints how the scalar epilogue loop should be
-// lowered.
-enum ScalarEpilogueLowering {
-
-  // The default: allowing scalar epilogues.
-  CM_ScalarEpilogueAllowed,
-
-  // Vectorization with OptForSize: don't allow epilogues.
-  CM_ScalarEpilogueNotAllowedOptSize,
-
-  // A special case of vectorisation with OptForSize: loops with a very small
-  // trip count are considered for vectorization under OptForSize, thereby
-  // making sure the cost of their loop body is dominant, free of runtime
-  // guards and scalar iteration overheads.
-  CM_ScalarEpilogueNotAllowedLowTripLoop,
-
-  // Loop hint predicate indicating an epilogue is undesired.
-  CM_ScalarEpilogueNotNeededUsePredicate,
-
-  // Directive indicating we must either tail fold or not vectorize
-  CM_ScalarEpilogueNotAllowedUsePredicate
-};
-
 /// LoopVectorizationCostModel - estimates the expected speedups due to
 /// vectorization.
 /// In many cases vectorization is not profitable. This can happen because of
@@ -889,19 +834,17 @@ class LoopVectorizationCostModel {
   friend class LoopVectorizationPlanner;
 
 public:
-  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
-                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
-                             LoopVectorizationLegality *Legal,
+  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+                             LoopInfo *LI, LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
                              const TargetLibraryInfo *TLI, DemandedBits *DB,
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
-                             const LoopVectorizeHints *Hints,
+                             LoopVectorizeHints &Hints,
                              InterleavedAccessInfo &IAI,
                              ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
-      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
-        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {
+      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {
     if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
       initializeVScaleForTuning();
     CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
@@ -970,7 +913,7 @@ class LoopVectorizationCostModel {
   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
   /// of FP operations.
   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
-    return !Hints->allowReordering() && RdxDesc.isOrdered();
+    return !Hints.allowReordering() && RdxDesc.isOrdered();
   }
 
   /// \returns The smallest bitwidth each instruction can be represented with.
@@ -1280,7 +1223,7 @@ class LoopVectorizationCostModel {
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
   bool requiresScalarEpilogue(bool IsVectorizing) const {
-    if (!isScalarEpilogueAllowed()) {
+    if (!Hints.isScalarEpilogueAllowed()) {
       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
       return false;
     }
@@ -1301,12 +1244,6 @@ class LoopVectorizationCostModel {
     return false;
   }
 
-  /// Returns true if a scalar epilogue is not allowed due to optsize or a
-  /// loop hint annotation.
-  bool isScalarEpilogueAllowed() const {
-    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
-  }
-
   /// Returns the TailFoldingStyle that is best for the current loop.
   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
     if (!ChosenTailFoldingStyle)
@@ -1345,8 +1282,9 @@ class LoopVectorizationCostModel {
       return;
     // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
     // if it's allowed, or DataWithoutLaneMask otherwise.
-    if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
-        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+    if (Hints.isScalarEpilogueAllowed() ||
+        Hints.getScalarEpilogue() ==
+            LoopVectorizeHints::SEK_NotNeededUsePredicate)
       ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
     else
       ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
@@ -1558,15 +1496,6 @@ class LoopVectorizationCostModel {
   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
       PredicatedBBsAfterVectorization;
 
-  /// Records whether it is allowed to have the original scalar loop execute at
-  /// least once. This may be needed as a fallback loop in case runtime
-  /// aliasing/dependence checks fail, or to handle the tail/remainder
-  /// iterations when the trip count is unknown or doesn't divide by the VF,
-  /// or as a peel-loop to handle gaps in interleave-groups.
-  /// Under optsize and when the trip count is very small we don't allow any
-  /// iterations to execute in the scalar loop.
-  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-
   /// Control finally chosen tail folding style. The first element is used if
   /// the IV update may overflow, the second element - if it does not.
   std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
@@ -1713,8 +1642,8 @@ class LoopVectorizationCostModel {
 
   const Function *TheFunction;
 
-  /// Loop Vectorize Hint.
-  const LoopVectorizeHints *Hints;
+  /// Loop Vectorize Hints.
+  LoopVectorizeHints &Hints;
 
   /// The interleave access information contains groups of interleaved accesses
   /// with the same stride and close to each other.
@@ -2976,7 +2905,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
       Legal->isMaskRequired(I);
   bool LoadAccessWithGapsRequiresEpilogMasking =
       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
-      !isScalarEpilogueAllowed();
+      !Hints.isScalarEpilogueAllowed();
   bool StoreAccessWithGapsRequiresMasking =
       isa<StoreInst>(I) && !Group->isFull();
   if (!PredicatedAccessRequiresMasking &&
@@ -3313,7 +3242,7 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
     return false;
 
-  if (Hints->isScalableVectorizationDisabled()) {
+  if (Hints.isScalableVectorizationDisabled()) {
     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                             "ScalableVectorizationDisabled", ORE, TheLoop);
     return false;
@@ -3538,21 +3467,21 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return FixedScalableVFPair::getNone();
   }
 
-  switch (ScalarEpilogueStatus) {
-  case CM_ScalarEpilogueAllowed:
+  switch (Hints.getScalarEpilogue()) {
+  case LoopVectorizeHints::SEK_Allowed:
     return computeFeasibleMaxVF(MaxTC, UserVF, false);
-  case CM_ScalarEpilogueNotAllowedUsePredicate:
+  case LoopVectorizeHints::SEK_NotAllowedUsePredicate:
     [[fallthrough]];
-  case CM_ScalarEpilogueNotNeededUsePredicate:
+  case LoopVectorizeHints::SEK_NotNeededUsePredicate:
     LLVM_DEBUG(
         dbgs() << "LV: vector predicate hint/switch found.\n"
                << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
-  case CM_ScalarEpilogueNotAllowedLowTripLoop:
+  case LoopVectorizeHints::SEK_NotAllowedLowTripLoop:
     // fallthrough as a special case of OptForSize
-  case CM_ScalarEpilogueNotAllowedOptSize:
-    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+  case LoopVectorizeHints::SEK_NotAllowedOptSize:
+    if (Hints.getScalarEpilogue() == LoopVectorizeHints::SEK_NotAllowedOptSize)
       LLVM_DEBUG(
           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
     else
@@ -3636,7 +3565,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
       // If we have a low-trip-count, and the fixed-width VF is known to divide
       // the trip count but the scalable factor does not, use the ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/160125