[llvm] 774fc63 - [LV] Consider minimum vscale assmuption for RT check cost.

Tue Jul 5 01:42:45 PDT 2022

Author: Florian Hahn
Date: 2022-07-05T09:41:58+01:00
New Revision: 774fc63490939c1afdc02f20ec48467991d3c548

URL: https://github.com/llvm/llvm-project/commit/774fc63490939c1afdc02f20ec48467991d3c548
DIFF: https://github.com/llvm/llvm-project/commit/774fc63490939c1afdc02f20ec48467991d3c548.diff

LOG: [LV] Consider minimum vscale assmuption for RT check cost.

For scalable VFs, the minimum assumed vscale needs to be included in the
cost-computation, otherwise a smaller VF may be used for RT check cost
computation than was used for earlier cost computations.

Fixes a RISCV test failing with UBSan due to both scalar and vector
loops having the same cost.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cdf9d4d96e94..03f1d9ec5b37 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1559,14 +1559,14 @@ class LoopVectorizationCostModel {
     Scalars.clear();
   }
 
-private:
-  unsigned NumPredStores = 0;
-
   /// Convenience function that returns the value of vscale_range iff
   /// vscale_range.min == vscale_range.max or otherwise returns the value
   /// returned by the corresponding TLI method.
   Optional<unsigned> getVScaleForTuning() const;
 
+private:
+  unsigned NumPredStores = 0;
+
   /// \return An upper bound for the vectorization factors for both
   /// fixed and scalable vectorization, where the minimum-known number of
   /// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -10243,7 +10243,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
 }
 
 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
-                                       VectorizationFactor &VF, Loop *L,
+                                       VectorizationFactor &VF,
+                                       Optional<unsigned> VScale, Loop *L,
                                        ScalarEvolution &SE) {
   InstructionCost CheckCost = Checks.getCost();
   if (!CheckCost.isValid())
@@ -10295,6 +10296,12 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   // the computations are performed on doubles, not integers and the result
   // is rounded up, hence we get an upper estimate of the TC.
   unsigned IntVF = VF.Width.getKnownMinValue();
+  if (VF.Width.isScalable()) {
+    unsigned AssumedMinimumVscale = 1;
+    if (VScale)
+      AssumedMinimumVscale = *VScale;
+    IntVF *= AssumedMinimumVscale;
+  }
   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
   double RtC = *CheckCost.getValue();
   double MinTC1 = RtC / (ScalarC - VecCOverVF);
@@ -10308,6 +10315,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
   double MinTC2 = RtC * 10 / ScalarC;
 
+  dbgs() << ScalarC << " " << RtC << " " << VecCOverVF << "\n";
   // Now pick the larger minimum. If it is not a multiple of VF, choose the
   // next closest multiple of VF. This should partly compensate for ignoring
   // the epilogue cost.
@@ -10520,7 +10528,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, L, *PSE.getSE())) {
+        !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+                                    *PSE.getSE())) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),