[llvm] [LV] Make ScalarEpilogue a LoopVectorizeHint (PR #160125)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 22 08:12:58 PDT 2025
https://github.com/artagnon created https://github.com/llvm/llvm-project/pull/160125
Move ScalarEpilogueLowering from LoopVectorizationCostModel into LoopVectorizeHints, as it is really a kind of hint that's dependent on the user's preferences. The patch has the side-effect of costing the scalar-epilogue correctly in some corner cases.
>From 8720c85b69a96d0f5c7f23cf7eb1687c84cc9c6f Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Mon, 22 Sep 2025 15:19:26 +0100
Subject: [PATCH] [LV] Make ScalarEpilogue a LoopVectorizeHint
Move ScalarEpilogueLowering from LoopVectorizationCostModel into
LoopVectorizeHints, as it is really a kind of hint that's dependent on
the user's preferences. The patch has the side-effect of costing the
scalar-epilogue correctly in some corner cases.
---
.../Vectorize/LoopVectorizationLegality.h | 50 +++-
.../Vectorize/LoopVectorizationLegality.cpp | 94 +++++++-
.../Transforms/Vectorize/LoopVectorize.cpp | 213 ++++--------------
.../LoopVectorize/RISCV/dead-ops-cost.ll | 4 +-
.../LoopVectorize/RISCV/strided-accesses.ll | 6 +-
5 files changed, 192 insertions(+), 175 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 405d4a742f37b..28bc90c6cf046 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -64,7 +64,8 @@ class LoopVectorizeHints {
HK_FORCE,
HK_ISVECTORIZED,
HK_PREDICATE,
- HK_SCALABLE
+ HK_SCALABLE,
+ HK_SCALAREPILOGUE
};
/// Hint - associates name and validation with the hint value.
@@ -97,6 +98,9 @@ class LoopVectorizeHints {
/// Says whether we should use fixed width or scalable vectorization.
Hint Scalable;
+ /// Hint specifying how we should lower the scalar epilogue.
+ Hint ScalarEpilogue;
+
/// Return the loop metadata prefix.
static StringRef Prefix() { return "llvm.loop."; }
@@ -121,6 +125,33 @@ class LoopVectorizeHints {
SK_PreferScalable = 1
};
+ /// Whether it is allowed to have the original scalar loop execute at least
+ /// once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+ /// iterations when the trip count is unknown or doesn't divide by the VF, or
+ /// as a peel-loop to handle gaps in interleave-groups. Under optsize and when
+ // the trip count is very small we don't allow any iterations to execute in
+ /// the scalar loop.
+ enum ScalarEpilogueKind {
+ // The default: allowing scalar epilogues.
+ SEK_Allowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
+ SEK_NotAllowedOptSize,
+
+ // A special case of vectorisation with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ SEK_NotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
+ SEK_NotNeededUsePredicate,
+
+ // Directive indicating we must either tail fold or not vectorize
+ SEK_NotAllowedUsePredicate
+ };
+
LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced,
OptimizationRemarkEmitter &ORE,
const TargetTransformInfo *TTI = nullptr);
@@ -156,6 +187,21 @@ class LoopVectorizeHints {
return FK_Disabled;
return (ForceKind)Force.Value;
}
+ ScalarEpilogueKind getScalarEpilogue() const {
+ return static_cast<ScalarEpilogueKind>(ScalarEpilogue.Value);
+ }
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogue.Value == SEK_Allowed;
+ }
+ void setScalarEpilogue(ScalarEpilogueKind SEK) { ScalarEpilogue.Value = SEK; }
+
+ // Determine how to lower the scalar epilogue and set it. Depends on 1)
+ // optimising for minimum code-size, 2) predicate compiler options, 3) loop
+ // hints forcing predication, and 4) a TTI hook that analyses whether the loop
+ // is suitable for predication.
+ void setScalarEpilogue(ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL,
+ InterleavedAccessInfo *IAI);
/// \return true if scalable vectorization has been explicitly disabled.
bool isScalableVectorizationDisabled() const {
@@ -196,6 +242,8 @@ class LoopVectorizeHints {
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter &ORE;
+
+ const TargetTransformInfo *TTI;
};
/// This holds vectorization requirements that must be verified late in
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ff35db14f7094..28ed2d6edf65e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -83,6 +83,34 @@ static cl::opt<bool> EnableHistogramVectorization(
"enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
cl::desc("Enables autovectorization of some loops containing histograms"));
+// Option prefer-predicate-over-epilogue indicates that an epilogue is
+// undesired, that predication is preferred, and this lists all options. I.e.,
+// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// and predicate the instructions accordingly. If tail-folding fails, there are
+// different fallback strategies depending on these values:
+enum class PreferPredicateTy {
+ ScalarEpilogue = 0,
+ PredicateElseScalarEpilogue,
+ PredicateOrDontVectorize
+};
+
+static cl::opt<PreferPredicateTy> PreferPredicateOverEpilogue(
+ "prefer-predicate-over-epilogue",
+ cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden,
+ cl::desc("Tail-folding and predication preferences over creating a scalar "
+ "epilogue loop."),
+ cl::values(
+ clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+ "Don't tail-predicate loops, create scalar epilogue"),
+ clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+ "predicate-else-scalar-epilogue",
+ "prefer tail-folding, create scalar epilogue if tail "
+ "folding fails."),
+ clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
+ "predicate-dont-vectorize",
+ "prefers tail-folding, don't attempt vectorization if "
+ "tail-folding fails.")));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -100,6 +128,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
case HK_PREDICATE:
case HK_SCALABLE:
return (Val == 0 || Val == 1);
+ case HK_SCALAREPILOGUE:
+ return Val <= SEK_NotAllowedUsePredicate;
}
return false;
}
@@ -114,7 +144,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
- TheLoop(L), ORE(ORE) {
+ ScalarEpilogue("scalarepilogue", SEK_Allowed, HK_SCALAREPILOGUE),
+ TheLoop(L), ORE(ORE), TTI(TTI) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -302,8 +333,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
return;
unsigned Val = C->getZExtValue();
- Hint *Hints[] = {&Width, &Interleave, &Force,
- &IsVectorized, &Predicate, &Scalable};
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized,
+ &Predicate, &Scalable, &ScalarEpilogue};
for (auto *H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
@@ -315,6 +346,63 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
}
}
+void LoopVectorizeHints::setScalarEpilogue(ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI,
+ TargetLibraryInfo *TLI,
+ LoopVectorizationLegality &LVL,
+ InterleavedAccessInfo *IAI) {
+ // 1) OptSize takes precedence over all other options, i.e. if this is set,
+ // don't look at hints or options, and don't request a scalar epilogue.
+ // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
+ // LoopAccessInfo (due to code dependency and not being able to reliably get
+ // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
+ // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
+ // versioning when the vectorization is forced, unlike hasOptSize. So revert
+ // back to the old way and vectorize with versioning when forced. See D81345.)
+ Function *F = TheLoop->getHeader()->getParent();
+ if (F->hasOptSize() ||
+ (llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass) &&
+ getForce() != LoopVectorizeHints::FK_Enabled)) {
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedOptSize;
+ return;
+ }
+
+ // 2) If set, obey the directives
+ if (PreferPredicateOverEpilogue.getNumOccurrences()) {
+ switch (PreferPredicateOverEpilogue) {
+ case PreferPredicateTy::ScalarEpilogue:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+ return;
+ case PreferPredicateTy::PredicateElseScalarEpilogue:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ case PreferPredicateTy::PredicateOrDontVectorize:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotAllowedUsePredicate;
+ return;
+ };
+ }
+
+ // 3) If set, obey the hints
+ switch (getPredicate()) {
+ case LoopVectorizeHints::FK_Enabled:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ case LoopVectorizeHints::FK_Disabled:
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+ return;
+ };
+
+ // 4) if the TTI hook indicates this is profitable, request predication.
+ TailFoldingInfo TFI(TLI, &LVL, IAI);
+ if (TTI->preferPredicateOverEpilogue(&TFI)) {
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_NotNeededUsePredicate;
+ return;
+ }
+
+ ScalarEpilogue.Value = LoopVectorizeHints::SEK_Allowed;
+}
+
// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca092dcfcb492..7ed66969dd489 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -197,37 +197,6 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
-// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
-// that predication is preferred, and this lists all options. I.e., the
-// vectorizer will try to fold the tail-loop (epilogue) into the vector body
-// and predicate the instructions accordingly. If tail-folding fails, there are
-// different fallback strategies depending on these values:
-namespace PreferPredicateTy {
- enum Option {
- ScalarEpilogue = 0,
- PredicateElseScalarEpilogue,
- PredicateOrDontVectorize
- };
-} // namespace PreferPredicateTy
-
-static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
- "prefer-predicate-over-epilogue",
- cl::init(PreferPredicateTy::ScalarEpilogue),
- cl::Hidden,
- cl::desc("Tail-folding and predication preferences over creating a scalar "
- "epilogue loop."),
- cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
- "scalar-epilogue",
- "Don't tail-predicate loops, create scalar epilogue"),
- clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
- "predicate-else-scalar-epilogue",
- "prefer tail-folding, create scalar epilogue if tail "
- "folding fails."),
- clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
- "predicate-dont-vectorize",
- "prefers tail-folding, don't attempt vectorization if "
- "tail-folding fails.")));
-
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
"force-tail-folding-style", cl::desc("Force the tail folding style"),
cl::init(TailFoldingStyle::None),
@@ -854,30 +823,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
} // end namespace llvm
namespace llvm {
-
-// Loop vectorization cost-model hints how the scalar epilogue loop should be
-// lowered.
-enum ScalarEpilogueLowering {
-
- // The default: allowing scalar epilogues.
- CM_ScalarEpilogueAllowed,
-
- // Vectorization with OptForSize: don't allow epilogues.
- CM_ScalarEpilogueNotAllowedOptSize,
-
- // A special case of vectorisation with OptForSize: loops with a very small
- // trip count are considered for vectorization under OptForSize, thereby
- // making sure the cost of their loop body is dominant, free of runtime
- // guards and scalar iteration overheads.
- CM_ScalarEpilogueNotAllowedLowTripLoop,
-
- // Loop hint predicate indicating an epilogue is undesired.
- CM_ScalarEpilogueNotNeededUsePredicate,
-
- // Directive indicating we must either tail fold or not vectorize
- CM_ScalarEpilogueNotAllowedUsePredicate
-};
-
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
@@ -889,19 +834,17 @@ class LoopVectorizationCostModel {
friend class LoopVectorizationPlanner;
public:
- LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
- PredicatedScalarEvolution &PSE, LoopInfo *LI,
- LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints,
+ LoopVectorizeHints &Hints,
InterleavedAccessInfo &IAI,
ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
- : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
- TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {
+ : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+ AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
initializeVScaleForTuning();
CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
@@ -970,7 +913,7 @@ class LoopVectorizationCostModel {
/// the IsOrdered flag of RdxDesc is set and we do not allow reordering
/// of FP operations.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
- return !Hints->allowReordering() && RdxDesc.isOrdered();
+ return !Hints.allowReordering() && RdxDesc.isOrdered();
}
/// \returns The smallest bitwidth each instruction can be represented with.
@@ -1280,7 +1223,7 @@ class LoopVectorizationCostModel {
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
- if (!isScalarEpilogueAllowed()) {
+ if (!Hints.isScalarEpilogueAllowed()) {
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
return false;
}
@@ -1301,12 +1244,6 @@ class LoopVectorizationCostModel {
return false;
}
- /// Returns true if a scalar epilogue is not allowed due to optsize or a
- /// loop hint annotation.
- bool isScalarEpilogueAllowed() const {
- return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
- }
-
/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
if (!ChosenTailFoldingStyle)
@@ -1345,8 +1282,9 @@ class LoopVectorizationCostModel {
return;
// If for some reason EVL mode is unsupported, fallback to a scalar epilogue
// if it's allowed, or DataWithoutLaneMask otherwise.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
- ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ if (Hints.isScalarEpilogueAllowed() ||
+ Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotNeededUsePredicate)
ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
else
ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
@@ -1558,15 +1496,6 @@ class LoopVectorizationCostModel {
DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
PredicatedBBsAfterVectorization;
- /// Records whether it is allowed to have the original scalar loop execute at
- /// least once. This may be needed as a fallback loop in case runtime
- /// aliasing/dependence checks fail, or to handle the tail/remainder
- /// iterations when the trip count is unknown or doesn't divide by the VF,
- /// or as a peel-loop to handle gaps in interleave-groups.
- /// Under optsize and when the trip count is very small we don't allow any
- /// iterations to execute in the scalar loop.
- ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-
/// Control finally chosen tail folding style. The first element is used if
/// the IV update may overflow, the second element - if it does not.
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
@@ -1713,8 +1642,8 @@ class LoopVectorizationCostModel {
const Function *TheFunction;
- /// Loop Vectorize Hint.
- const LoopVectorizeHints *Hints;
+ /// Loop Vectorize Hints.
+ LoopVectorizeHints &Hints;
/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
@@ -2976,7 +2905,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
Legal->isMaskRequired(I);
bool LoadAccessWithGapsRequiresEpilogMasking =
isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
- !isScalarEpilogueAllowed();
+ !Hints.isScalarEpilogueAllowed();
bool StoreAccessWithGapsRequiresMasking =
isa<StoreInst>(I) && !Group->isFull();
if (!PredicatedAccessRequiresMasking &&
@@ -3313,7 +3242,7 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
return false;
- if (Hints->isScalableVectorizationDisabled()) {
+ if (Hints.isScalableVectorizationDisabled()) {
reportVectorizationInfo("Scalable vectorization is explicitly disabled",
"ScalableVectorizationDisabled", ORE, TheLoop);
return false;
@@ -3538,21 +3467,21 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
- switch (ScalarEpilogueStatus) {
- case CM_ScalarEpilogueAllowed:
+ switch (Hints.getScalarEpilogue()) {
+ case LoopVectorizeHints::SEK_Allowed:
return computeFeasibleMaxVF(MaxTC, UserVF, false);
- case CM_ScalarEpilogueNotAllowedUsePredicate:
+ case LoopVectorizeHints::SEK_NotAllowedUsePredicate:
[[fallthrough]];
- case CM_ScalarEpilogueNotNeededUsePredicate:
+ case LoopVectorizeHints::SEK_NotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
<< "LV: Not allowing scalar epilogue, creating predicated "
<< "vector loop.\n");
break;
- case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ case LoopVectorizeHints::SEK_NotAllowedLowTripLoop:
// fallthrough as a special case of OptForSize
- case CM_ScalarEpilogueNotAllowedOptSize:
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ case LoopVectorizeHints::SEK_NotAllowedOptSize:
+ if (Hints.getScalarEpilogue() == LoopVectorizeHints::SEK_NotAllowedOptSize)
LLVM_DEBUG(
dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
else
@@ -3636,7 +3565,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
// factor in preference to allow the generation of a non-predicated loop.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
+ if (Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotAllowedLowTripLoop &&
NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
"remain for any chosen VF.\n");
@@ -3677,14 +3607,16 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+ if (Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotNeededUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
- ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+ Hints.setScalarEpilogue(LoopVectorizeHints::SEK_Allowed);
return MaxFactors;
}
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
+ if (Hints.getScalarEpilogue() ==
+ LoopVectorizeHints::SEK_NotAllowedUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
return FixedScalableVFPair::getNone();
}
@@ -4344,7 +4276,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}
- if (!CM.isScalarEpilogueAllowed()) {
+ if (!Hints.isScalarEpilogueAllowed()) {
LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
"epilogue is allowed.\n");
return Result;
@@ -4538,7 +4470,7 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.
- if (!CM.isScalarEpilogueAllowed())
+ if (!Hints.isScalarEpilogueAllowed())
return 1;
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
@@ -5288,7 +5220,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
- (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
+ (Group->requiresScalarEpilogue() && !Hints.isScalarEpilogueAllowed()) ||
(isa<StoreInst>(I) && !Group->isFull());
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,
@@ -8718,7 +8650,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// single VPInterleaveRecipe at its insertion point.
VPlanTransforms::runPass(VPlanTransforms::createInterleaveGroups, *Plan,
InterleaveGroups, RecipeBuilder,
- CM.isScalarEpilogueAllowed());
+ Hints.isScalarEpilogueAllowed());
// Replace VPValues for known constant strides.
VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
@@ -9155,7 +9087,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(0)) {
assert((!CM.OptForSize ||
- CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
+ Hints.getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
HasBranchWeights);
@@ -9169,7 +9101,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
if (CM.OptForSize) {
assert(
- CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
+ Hints.getForce() == LoopVectorizeHints::FK_Enabled &&
"Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.");
ORE->emit([&]() {
@@ -9227,55 +9159,6 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.set(this, DerivedIV, VPLane(0));
}
-// Determine how to lower the scalar epilogue, which depends on 1) optimising
-// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
-// predication, and 4) a TTI hook that analyses whether the loop is suitable
-// for predication.
-static ScalarEpilogueLowering getScalarEpilogueLowering(
- Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
- // 1) OptSize takes precedence over all other options, i.e. if this is set,
- // don't look at hints or options, and don't request a scalar epilogue.
- // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
- // LoopAccessInfo (due to code dependency and not being able to reliably get
- // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
- // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
- // versioning when the vectorization is forced, unlike hasOptSize. So revert
- // back to the old way and vectorize with versioning when forced. See D81345.)
- if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
- PGSOQueryType::IRPass) &&
- Hints.getForce() != LoopVectorizeHints::FK_Enabled))
- return CM_ScalarEpilogueNotAllowedOptSize;
-
- // 2) If set, obey the directives
- if (PreferPredicateOverEpilogue.getNumOccurrences()) {
- switch (PreferPredicateOverEpilogue) {
- case PreferPredicateTy::ScalarEpilogue:
- return CM_ScalarEpilogueAllowed;
- case PreferPredicateTy::PredicateElseScalarEpilogue:
- return CM_ScalarEpilogueNotNeededUsePredicate;
- case PreferPredicateTy::PredicateOrDontVectorize:
- return CM_ScalarEpilogueNotAllowedUsePredicate;
- };
- }
-
- // 3) If set, obey the hints
- switch (Hints.getPredicate()) {
- case LoopVectorizeHints::FK_Enabled:
- return CM_ScalarEpilogueNotNeededUsePredicate;
- case LoopVectorizeHints::FK_Disabled:
- return CM_ScalarEpilogueAllowed;
- };
-
- // 4) if the TTI hook indicates this is profitable, request predication.
- TailFoldingInfo TFI(TLI, &LVL, IAI);
- if (TTI->preferPredicateOverEpilogue(&TFI))
- return CM_ScalarEpilogueNotNeededUsePredicate;
-
- return CM_ScalarEpilogueAllowed;
-}
-
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -9296,11 +9179,9 @@ static bool processLoopInVPlanNativePath(
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- ScalarEpilogueLowering SEL =
- getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
-
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI, PSI, BFI);
+ Hints.setScalarEpilogue(PSI, BFI, TLI, *LVL, &IAI);
+ LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ Hints, IAI, PSI, BFI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
@@ -9421,7 +9302,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF, Loop *L,
PredicatedScalarEvolution &PSE,
VPCostContext &CostCtx, VPlan &Plan,
- ScalarEpilogueLowering SEL,
+ const LoopVectorizeHints &Hints,
std::optional<unsigned> VScale) {
InstructionCost TotalCost = Checks.getCost();
if (!TotalCost.isValid())
@@ -9498,7 +9379,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
// epilogue is allowed, choose the next closest multiple of VF. This should
// partly compensate for ignoring the epilogue cost.
uint64_t MinTC = std::max(MinTC1, MinTC2);
- if (SEL == CM_ScalarEpilogueAllowed)
+ if (Hints.isScalarEpilogueAllowed())
MinTC = alignTo(MinTC, IntVF);
VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
@@ -9926,8 +9807,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL =
- getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
+ Hints.setScalarEpilogue(PSI, BFI, TLI, LVL, &IAI);
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
@@ -9943,12 +9823,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << "\n");
// Predicate tail-folded loops are efficient even when the loop
// iteration count is low. However, setting the epilogue policy to
- // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
+ // `SEK_NotAllowedLowTripLoop` prevents vectorizing loops
// with runtime checks. It's more effective to let
// `isOutsideLoopWorkProfitable` determine if vectorization is
// beneficial for the loop.
- if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ if (Hints.getScalarEpilogue() !=
+ LoopVectorizeHints::SEK_NotNeededUsePredicate)
+ Hints.setScalarEpilogue(LoopVectorizeHints::SEK_NotAllowedLowTripLoop);
}
}
@@ -9999,8 +9880,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
// Use the cost model.
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
- F, &Hints, IAI, PSI, BFI);
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
+ Hints, IAI, PSI, BFI);
// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
@@ -10046,7 +9927,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.CostKind);
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
- LVP.getPlanFor(VF.Width), SEL,
+ LVP.getPlanFor(VF.Width), Hints,
CM.getVScaleForTuning())) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 65373e4128b7f..49cc0b7ad5def 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -83,7 +83,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 6)
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 8)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
@@ -168,7 +168,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 770, [[UMAX3]]
; CHECK-NEXT: [[SMAX4:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[SMAX4]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 19
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 24
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 4c84913eea23d..e7df49c9062e2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -706,7 +706,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED-UF2-NEXT: [[P21:%.*]] = ptrtoint ptr [[P2:%.*]] to i64
; NOSTRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; NOSTRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; NOSTRIDED-UF2-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 12)
+; NOSTRIDED-UF2-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 16)
; NOSTRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX]]
; NOSTRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; NOSTRIDED-UF2: vector.scevcheck:
@@ -862,7 +862,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-UF2-NEXT: entry:
; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; STRIDED-UF2-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 79)
+; STRIDED-UF2-NEXT: [[UMAX9:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 80)
; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX9]]
; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; STRIDED-UF2: vector.scevcheck:
@@ -1224,7 +1224,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
; STRIDED-UF2-NEXT: entry:
; STRIDED-UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; STRIDED-UF2-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
-; STRIDED-UF2-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 28)
+; STRIDED-UF2-NEXT: [[UMAX6:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 32)
; STRIDED-UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[UMAX6]]
; STRIDED-UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; STRIDED-UF2: vector.memcheck:
More information about the llvm-commits
mailing list