[llvm] 644a965 - [LV] Vectorize cases with larger number of RT checks, execute only if profitable.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 4 07:12:01 PDT 2022


Author: Florian Hahn
Date: 2022-07-04T15:11:39+01:00
New Revision: 644a965c1efef68f22d9495e4cefbb599c214788

URL: https://github.com/llvm/llvm-project/commit/644a965c1efef68f22d9495e4cefbb599c214788
DIFF: https://github.com/llvm/llvm-project/commit/644a965c1efef68f22d9495e4cefbb599c214788.diff

LOG: [LV] Vectorize cases with larger number of RT checks, execute only if profitable.

This patch replaces the tight hard cut-off for the number of runtime
checks with a more accurate cost-driven approach.

The new approach allows vectorization with a larger number of runtime
checks in general, but the vector loop (and its runtime checks) is only
executed if doing so is profitable at runtime. Profitable here means that
the cost model indicates that runtime check cost + vector loop cost < scalar loop cost.

To do that, LV computes the minimum trip count for which runtime check cost
+ vector loop cost < scalar loop cost holds.
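
To make that comparison concrete, below is a minimal standalone sketch of the
computation, following the same formula as the new `areRuntimeChecksProfitable`
helper in the diff further down. The per-iteration costs `ScalarC`, `VecC`, the
check cost `RtC` and the factor `VF` are made-up illustrative values, not
numbers taken from any benchmark in this patch.

```
// Illustrative sketch only: mirrors the minimum-trip-count computation
// described above, with assumed per-iteration costs.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  double ScalarC = 10.0; // cost of one scalar iteration (assumed)
  double VecC = 28.0;    // cost of one vector iteration (assumed)
  unsigned VF = 4;       // vectorization factor (assumed)
  double RtC = 35.0;     // cost of the generated runtime checks (assumed)

  // The vector loop wins once RtC + VecC * (TC / VF) < ScalarC * TC,
  // i.e. TC > RtC / (ScalarC - VecC / VF).
  double MinTC1 = RtC / (ScalarC - VecC / VF);

  // Also require the check cost to be at most ~10% of the scalar loop
  // cost, to bound the overhead incurred if the checks fail at runtime.
  double MinTC2 = RtC * 10 / ScalarC;

  // Take the larger bound and round up to the next multiple of VF.
  uint64_t MinTC = (uint64_t)std::ceil(std::max(MinTC1, MinTC2));
  MinTC = (MinTC + VF - 1) / VF * VF;

  // With these numbers: MinTC1 ~= 11.7, MinTC2 = 35, printed result 36.
  std::cout << "Minimum profitable trip count: " << MinTC << "\n";
  return 0;
}
```

As the per-lane vector cost approaches the scalar cost, the denominator shrinks
and the required trip count grows quickly, which is why short-running loops
still fall back to the scalar loop.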

Note that there is still a hard cut-off to avoid excessive compile-time/code-size
increases, but it is much larger than the original limit.
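
For reference, that remaining hard cut-off is controlled by the renamed hidden
`-vectorize-memory-check-threshold` option (default 128) introduced in the diff
below. A hypothetical invocation tightening it, in the style of the updated RUN
lines (the input file `input.ll` is just a placeholder), would be:

```
opt -loop-vectorize -vectorize-memory-check-threshold=16 -S input.ll
```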

The performance impact on standard test suites like SPEC2006/SPEC2017/MultiSource
is mostly neutral, but the new approach can give substantial gains in cases where
vectorization previously failed due to the over-aggressive cut-offs.

On AArch64 with -O3, I didn't observe any regressions outside the noise level (<0.4%),
and there are the following execution-time improvements. Both `IRSmk` and `srad` are relatively short-running, but the changes are far above the noise level for them on my benchmark system.

```
CFP2006/447.dealII/447.dealII    -1.9%
CINT2017rate/525.x264_r/525.x264_r    -2.2%
ASC_Sequoia/IRSmk/IRSmk       -9.2%
Rodinia/srad/srad     -36.1%
```

`size` regressions on AArch64 with -O3 (old size in bytes, new size in bytes, increase) are

```
MultiSource/Applications/hbd/hbd                 90256.00   106768.00 18.3%
MultiSourc...ks/ASCI_Purple/SMG2000/smg2000     240676.00   257268.00  6.9%
MultiSourc...enchmarks/mafft/pairlocalalign     472603.00   489131.00  3.5%
External/S...2017rate/525.x264_r/525.x264_r     613831.00   630343.00  2.7%
External/S...NT2006/464.h264ref/464.h264ref     818920.00   835448.00  2.0%
External/S...te/538.imagick_r/538.imagick_r    1994730.00  2027754.00  1.7%
MultiSourc...nchmarks/tramp3d-v4/tramp3d-v4    1236471.00  1253015.00  1.3%
MultiSource/Applications/oggenc/oggenc         2108147.00  2124675.00  0.8%
External/S.../CFP2006/447.dealII/447.dealII    4742999.00  4759559.00  0.3%
External/S...rate/510.parest_r/510.parest_r   14206377.00 14239433.00  0.2%
```

Reviewed By: lebedev.ri, ebrevnov, dmgreen

Differential Revision: https://reviews.llvm.org/D109368

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
    llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
    llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
    llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
    llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
    llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
    llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
    llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
    llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index b01bd222b252e..2d69c2f866422 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -219,16 +219,9 @@ class LoopVectorizationRequirements {
       ExactFPMathInst = I;
   }
 
-  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
-
   Instruction *getExactFPInst() { return ExactFPMathInst; }
 
-  unsigned getNumRuntimePointerChecks() const {
-    return NumRuntimePointerChecks;
-  }
-
 private:
-  unsigned NumRuntimePointerChecks = 0;
   Instruction *ExactFPMathInst = nullptr;
 };
 

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 6242d9a93fc1a..8b5aca8d2ade2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -993,7 +993,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     }
   }
 
-  Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
   PSE.addPredicate(LAI->getPSE().getPredicate());
   return true;
 }

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 0cb2032fa45ab..933ea9e023021 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -33,7 +33,6 @@ class LoopInfo;
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 class PredicatedScalarEvolution;
-class LoopVectorizationRequirements;
 class LoopVectorizeHints;
 class OptimizationRemarkEmitter;
 class TargetTransformInfo;
@@ -191,6 +190,10 @@ struct VectorizationFactor {
   /// Cost of the scalar loop.
   InstructionCost ScalarCost;
 
+  /// The minimum trip count required to make vectorization profitable, e.g. due
+  /// to runtime checks.
+  ElementCount MinProfitableTripCount;
+
   VectorizationFactor(ElementCount Width, InstructionCost Cost,
                       InstructionCost ScalarCost)
       : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
@@ -268,8 +271,6 @@ class LoopVectorizationPlanner {
 
   const LoopVectorizeHints &Hints;
 
-  LoopVectorizationRequirements &Requirements;
-
   OptimizationRemarkEmitter *ORE;
 
   SmallVector<VPlanPtr, 4> VPlans;
@@ -285,10 +286,9 @@ class LoopVectorizationPlanner {
                            InterleavedAccessInfo &IAI,
                            PredicatedScalarEvolution &PSE,
                            const LoopVectorizeHints &Hints,
-                           LoopVectorizationRequirements &Requirements,
                            OptimizationRemarkEmitter *ORE)
       : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
-        PSE(PSE), Hints(Hints), Requirements(Requirements), ORE(ORE) {}
+        PSE(PSE), Hints(Hints), ORE(ORE) {}
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b637b2d5ddaed..25ec712f43ddf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -196,10 +196,9 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
              "value are vectorized only if no scalar iteration overheads "
              "are incurred."));
 
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
-    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
-    cl::desc("The maximum allowed number of runtime memory checks with a "
-             "vectorize(enable) pragma."));
+static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
+    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+    cl::desc("The maximum allowed number of runtime memory checks"));
 
 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
 // that predication is preferred, and this lists all options. I.e., the
@@ -442,6 +441,7 @@ class InnerLoopVectorizer {
                       const TargetLibraryInfo *TLI,
                       const TargetTransformInfo *TTI, AssumptionCache *AC,
                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
+                      ElementCount MinProfitableTripCount,
                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
@@ -453,6 +453,11 @@ class InnerLoopVectorizer {
     // of the original loop header may change as the transformation happens.
     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+
+    if (MinProfitableTripCount.isZero())
+      this->MinProfitableTripCount = VecWidth;
+    else
+      this->MinProfitableTripCount = MinProfitableTripCount;
   }
 
   virtual ~InnerLoopVectorizer() = default;
@@ -656,6 +661,8 @@ class InnerLoopVectorizer {
   /// vector elements.
   ElementCount VF;
 
+  ElementCount MinProfitableTripCount;
+
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
   unsigned UF;
@@ -735,6 +742,7 @@ class InnerLoopUnroller : public InnerLoopVectorizer {
                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+                            ElementCount::getFixed(1),
                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                             BFI, PSI, Check) {}
 
@@ -783,8 +791,8 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
       GeneratedRTChecks &Checks)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
-                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
-                            Checks),
+                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
+                            CM, BFI, PSI, Checks),
         EPI(EPI) {}
 
   // Override this function to handle the more complex control flow around the
@@ -1849,14 +1857,17 @@ class GeneratedRTChecks {
 
   DominatorTree *DT;
   LoopInfo *LI;
+  TargetTransformInfo *TTI;
 
   SCEVExpander SCEVExp;
   SCEVExpander MemCheckExp;
 
+  bool CostTooHigh = false;
+
 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
-                    const DataLayout &DL)
-      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
+                    TargetTransformInfo *TTI, const DataLayout &DL)
+      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
         MemCheckExp(SE, DL, "scev.check") {}
 
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
@@ -1867,6 +1878,15 @@ class GeneratedRTChecks {
   void Create(Loop *L, const LoopAccessInfo &LAI,
               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
 
+    // Hard cutoff to limit compile-time increase in case a very large number of
+    // runtime checks needs to be generated.
+    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
+    // profile info.
+    CostTooHigh =
+        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
+    if (CostTooHigh)
+      return;
+
     BasicBlock *LoopHeader = L->getHeader();
     BasicBlock *Preheader = L->getLoopPreheader();
 
@@ -1938,6 +1958,44 @@ class GeneratedRTChecks {
     }
   }
 
+  InstructionCost getCost() {
+    if (SCEVCheckBlock || MemCheckBlock)
+      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
+
+    if (CostTooHigh) {
+      InstructionCost Cost;
+      Cost.setInvalid();
+      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
+      return Cost;
+    }
+
+    InstructionCost RTCheckCost = 0;
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        InstructionCost C =
+            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
+        RTCheckCost += C;
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        InstructionCost C =
+            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
+        RTCheckCost += C;
+      }
+
+    if (SCEVCheckBlock || MemCheckBlock)
+      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
+                        << "\n");
+
+    return RTCheckCost;
+  }
+
   /// Remove the created SCEV & memory runtime check blocks & instructions, if
   /// unused.
   ~GeneratedRTChecks() {
@@ -2880,9 +2938,16 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   // If tail is to be folded, vector loop takes care of all iterations.
   Type *CountTy = Count->getType();
   Value *CheckMinIters = Builder.getFalse();
-  Value *Step = createStepForVF(Builder, CountTy, VF, UF);
+  auto CreateStep = [&]() {
+    // Create step with max(MinProfitableTripCount, UF * VF).
+    if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
+      return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
+    return createStepForVF(Builder, CountTy, VF, UF);
+  };
+
   if (!Cost->foldTailByMasking())
-    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
+    CheckMinIters =
+        Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
   else if (VF.isScalable()) {
     // vscale is not necessarily a power-of-2, which means we cannot guarantee
     // an overflow to zero when updating induction variables and so an
@@ -2894,8 +2959,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
 
     // Don't execute the vector loop if (UMax - n) < (VF * UF).
-    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
+    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
   }
+
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
@@ -2920,7 +2986,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
-
   BasicBlock *const SCEVCheckBlock =
       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
   if (!SCEVCheckBlock)
@@ -7363,14 +7428,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   return VectorizationFactor::Disabled();
 }
 
-bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
-  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
-  return (NumRuntimePointerChecks >
-              VectorizerParams::RuntimeMemoryCheckThreshold &&
-          !Hints.allowReordering()) ||
-         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-}
-
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -10103,8 +10160,7 @@ static bool processLoopInVPlanNativePath(
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
-                               Requirements, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
 
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
@@ -10123,10 +10179,10 @@ static bool processLoopInVPlanNativePath(
   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
 
   {
-    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                              F->getParent()->getDataLayout());
-    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
-                           &CM, BFI, PSI, Checks);
+    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                       << L->getHeader()->getParent()->getName() << "\"\n");
     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10183,6 +10239,94 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
   }
 }
 
+static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
+                                       VectorizationFactor &VF, Loop *L,
+                                       ScalarEvolution &SE) {
+  InstructionCost CheckCost = Checks.getCost();
+  if (!CheckCost.isValid())
+    return false;
+
+  // When interleaving only scalar and vector cost will be equal, which in turn
+  // would lead to a divide by 0. Fall back to hard threshold.
+  if (VF.Width.isScalar()) {
+    if (CheckCost > VectorizeMemoryCheckThreshold) {
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Interleaving only is not profitable due to runtime checks\n");
+      return false;
+    }
+    return true;
+  }
+
+  // First, compute the minimum iteration count required so that the vector
+  // loop outperforms the scalar loop.
+  //  The total cost of the scalar loop is
+  //   ScalarC * TC
+  //  where
+  //  * TC is the actual trip count of the loop.
+  //  * ScalarC is the cost of a single scalar iteration.
+  //
+  //  The total cost of the vector loop is
+  //    RtC + VecC * (TC / VF) + EpiC
+  //  where
+  //  * RtC is the cost of the generated runtime checks
+  //  * VecC is the cost of a single vector iteration.
+  //  * TC is the actual trip count of the loop
+  //  * VF is the vectorization factor
+  //  * EpiC is the cost of the generated epilogue, including the cost
+  //    of the remaining scalar operations.
+  //
+  // Vectorization is profitable once the total vector cost is less than the
+  // total scalar cost:
+  //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
+  //
+  // Now we can compute the minimum required trip count TC as
+  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
+  //
+  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
+  // the computations are performed on doubles, not integers and the result
+  // is rounded up, hence we get an upper estimate of the TC.
+  unsigned IntVF = VF.Width.getKnownMinValue();
+  double ScalarC = *VF.ScalarCost.getValue();
+  double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
+  double RtC = *CheckCost.getValue();
+  double MinTC1 = RtC / (ScalarC - VecCOverVF);
+
+  // Second, compute a minimum iteration count so that the cost of the
+  // runtime checks is only a fraction of the total scalar loop cost. This
+  // adds a loop-dependent bound on the overhead incurred if the runtime
+  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
+  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
+  // cost, compute
+  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
+  double MinTC2 = RtC * 10 / ScalarC;
+
+  // Now pick the larger minimum. If it is not a multiple of VF, choose the
+  // next closest multiple of VF. This should partly compensate for ignoring
+  // the epilogue cost.
+  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
+  VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
+
+  LLVM_DEBUG(
+      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
+             << VF.MinProfitableTripCount << "\n");
+
+  // Skip vectorization if the expected trip count is less than the minimum
+  // required trip count.
+  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
+    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
+                                VF.MinProfitableTripCount)) {
+      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
+                           "trip count < minimum profitable VF ("
+                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
+                        << ")\n");
+
+      return false;
+    }
+  }
+  return true;
+}
+
 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                                !EnableLoopInterleaving),
@@ -10340,8 +10484,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   CM.collectElementTypesForWidening();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
-                               Requirements, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
 
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
@@ -10353,21 +10496,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
 
-  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                            F->getParent()->getDataLayout());
   if (MaybeVF) {
-    if (LVP.requiresTooManyRuntimeChecks()) {
-      ORE->emit([&]() {
-        return OptimizationRemarkAnalysisAliasing(
-                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
-                   L->getHeader())
-               << "loop not vectorized: cannot prove it is safe to reorder "
-                  "memory operations";
-      });
-      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
-      Hints.emitRemarkWithHints();
-      return false;
-    }
     VF = *MaybeVF;
     // Select the interleave count.
     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
@@ -10377,6 +10508,13 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     //  they turn out to not be profitable.
     if (VF.Width.isVector() || SelectedIC > 1)
       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+
+    // Check if it is profitable to vectorize with runtime checks.
+    bool ForceVectorization =
+        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+    if (!ForceVectorization &&
+        !areRuntimeChecksProfitable(Checks, VF, L, *PSE.getSE()))
+      return false;
   }
 
   // Identify the diagnostic messages that should be produced.
@@ -10533,8 +10671,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         if (!MainILV.areSafetyChecksAdded())
           DisableRuntimeUnroll = true;
       } else {
-        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                               &LVL, &CM, BFI, PSI, Checks);
+        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
+                               VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
+                               PSI, Checks);
 
         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
index afea25a635723..20182dbbf0acc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
@@ -1,4 +1,5 @@
-; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck %s
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s
 
 ; Tests for loops with large numbers of runtime checks. Check that loops are
 ; vectorized, if the loop trip counts are large and the impact of the runtime
@@ -57,11 +58,13 @@ exit:
   ret void
 }
 
-; FIXME
 ; The trip count of the loop in this function is high enough to warrant large runtime checks.
 ; CHECK-LABEL: define {{.*}} @test_tc_big_enough
-; CHECK-NOT: vector.memcheck
-; CHECK-NOT: vector.body
+; DEFAULT: vector.memcheck
+; DEFAULT: vector.body
+; THRESHOLD-NOT: vector.memcheck
+; THRESHOLD-NOT: vector.body
+;
 define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) {
 entry:
   br label %loop
@@ -112,8 +115,11 @@ exit:
 
 define void @test_tc_unknown(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2, i64 %N) {
 ; CHECK-LABEL: define void @test_tc_unknown
-; CHECK-NOT: vector.memcheck
-; CHECK-NOT: vector.body
+; DEFAULT:       [[ADD:%.+]] = add i64 %N, 1
+; DEFAULT-NEXT:  [[C:%.+]] = icmp ult i64 [[ADD]], 16
+; DEFAULT-NEXT:  br i1 [[C]], label %scalar.ph, label %vector.memcheck
+; THRESHOLD-NOT: vector.memcheck
+; THRESHOLD-NOT: vector.body
 ;
 entry:
   br label %loop

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index 03061361b390e..ec7732f4d4e9d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -10,9 +10,9 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index e5f6d46202e43..7c7f171ab431b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -8,9 +8,9 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -102,9 +102,9 @@ define void @cond_memset(i32 %val, i32* noalias readonly %cond_ptr, i32* noalias
 ; CHECK-LABEL: @cond_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index af7d92ecbb194..7d4ddaf2c4568 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -10,9 +10,9 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memset(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -63,9 +63,9 @@ define void @simple_memcpy(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_memcpy(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -123,9 +123,9 @@ define void @copy_stride4(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 -1, [[TMP2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 -1, [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -193,9 +193,9 @@ define void @simple_gather_scatter(i32* noalias %dst, i32* noalias %src, i32* no
 ; CHECK-LABEL: @simple_gather_scatter(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -253,9 +253,9 @@ while.end.loopexit:                               ; preds = %while.body
 define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_load(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -312,9 +312,9 @@ for.end:                                          ; preds = %for.body, %entry
 define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i32* noalias readonly %cond, i64 %n) #0 {
 ; CHECK-LABEL: @cond_uniform_load(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -389,9 +389,9 @@ for.end:                                          ; preds = %for.inc, %entry
 define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @uniform_store(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -445,9 +445,9 @@ define void @simple_fdiv(float* noalias %dst, float* noalias %src, i64 %n) #0 {
 ; CHECK-LABEL: @simple_fdiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -507,9 +507,9 @@ define i32 @add_reduction_i32(i32* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @add_reduction_i32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -563,9 +563,9 @@ define float @add_reduction_f32(float* %ptr, i64 %n) #0 {
 ; CHECK-LABEL: @add_reduction_f32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[UMAX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -617,9 +617,9 @@ while.end.loopexit:                               ; preds = %while.body
 define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
 ; CHECK-LABEL: @cond_xor_reduction(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 -1, [[N:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 0007427eecb96..3c88257940b3b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -863,7 +863,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly noalias
 ; AVX512-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
 ; AVX512-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
 ; AVX512-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
+; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 32
 ; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; AVX512:       vector.memcheck:
 ; AVX512-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
index 31a70a36763b7..d0cf24d690fc2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 
-; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
+; RUN: opt -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -10,11 +10,63 @@ declare double @llvm.pow.f64(double, double)
 
 ; Test case where the memory runtime checks and vector body is more expensive
 ; than running the scalar loop.
-; TODO: should not be vectorized.
 define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) {
+
+; CHECK: Calculating cost of runtime checks:
+; CHECK-NEXT:  0  for   {{.+}} = getelementptr double, double* %A, i64 16
+; CHECK-NEXT:  0  for   {{.+}} = bitcast double*
+; CHECK-NEXT:  0  for   {{.+}} = getelementptr double, double* %B, i64 16
+; CHECK-NEXT:  0  for   {{.+}} = bitcast double*
+; CHECK-NEXT:  0  for   {{.+}} = getelementptr double, double* %E, i64 16
+; CHECK-NEXT:  0  for   {{.+}} = bitcast double*
+; CHECK-NEXT:  0  for   {{.+}} = getelementptr double, double* %C, i64 16
+; CHECK-NEXT:  0  for   {{.+}} = bitcast double*
+; CHECK-NEXT:  0  for   {{.+}} = getelementptr double, double* %D, i64 16
+; CHECK-NEXT:  0  for   {{.+}} = bitcast double*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = or i1
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = or i1
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = or i1
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = or i1
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = or i1
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = or i1
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = or i1
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = icmp ult i8*
+; CHECK-NEXT:  1  for   {{.+}} = and i1
+; CHECK-NEXT:  1  for   {{.+}} = or i1
+; CHECK-NEXT: Total cost of runtime checks: 35
+
+; CHECK: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 70)
+;
 ; CHECK-LABEL: @test(
-; CHECK: vector.memcheck
-; CHECK: vector.body
+; CHECK-NEXT: entry:
+; CHECK-NEXT:  br label %for.body
+; CHECK-NOT: vector.memcheck
+; CHECK-NOT: vector.body
 ;
 entry:
   br label %for.body

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
index 2fb6a13f9cf40..246b4e16e281a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
@@ -15,7 +15,7 @@ define void @foo(i8 addrspace(1)* align 8 dereferenceable_or_null(16), i8 addrsp
 ; CHECK-NEXT:    [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16
 ; CHECK-NEXT:    [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)*
 ; CHECK-NEXT:    [[UMAX2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2:%.*]], i64 1)
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX2]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX2]], 20
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 1)

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
index cf75d309d458f..12508734cd678 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -40,7 +40,7 @@ define i32 @main(i32* %ptr) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], 1
 ; CHECK-NEXT:    [[UMIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 [[TMP4]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP5]], [[UMIN1]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 8
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 32
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i8 [[CONV3]], -1

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
index 5419efd454c73..d03e7b96b58a9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
@@ -22,7 +22,7 @@ define {} addrspace(10)* @japi1_vect_42283({} addrspace(10)** nocapture readonly
 ; CHECK-NEXT:    [[DOTELT1:%.*]] = getelementptr inbounds { {} addrspace(10)*, i64 }, { {} addrspace(10)*, i64 } addrspace(10)* [[TMP6]], i64 0, i32 1
 ; CHECK-NEXT:    [[DOTUNPACK2:%.*]] = load i64, i64 addrspace(10)* [[DOTELT1]], align 8, !tbaa [[TBAA8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw i64 [[TMP2]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP11]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP11]], 28
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
index 1b5622dbde7d1..86fae7804060e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/runtime-limit.ll
@@ -1,5 +1,4 @@
-; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE
-; RUN: opt < %s -loop-vectorize -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -8,20 +7,12 @@ target triple = "x86_64-unknown-linux"
 ; First loop produced diagnostic pass remark.
 ;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2)
 ; Second loop produces diagnostic analysis remark.
-;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
-
-; First loop produced diagnostic pass remark.
-;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 2)
-; Second loop produces diagnostic pass remark.
-;OVERRIDE: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
+;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
 
 ; We are vectorizing with 6 runtime checks.
 ;CHECK-LABEL: func1x6(
 ;CHECK: <4 x i32>
 ;CHECK: ret
-;OVERRIDE-LABEL: func1x6(
-;OVERRIDE: <4 x i32>
-;OVERRIDE: ret
 define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
 entry:
   br label %for.body
@@ -52,14 +43,10 @@ for.end:                                          ; preds = %for.body
   ret i32 undef
 }
 
-; We are not vectorizing with 12 runtime checks.
+; We are vectorizing with 12 runtime checks.
 ;CHECK-LABEL: func2x6(
-;CHECK-NOT: <4 x i32>
+;CHECK: <4 x i32>
 ;CHECK: ret
-; We vectorize with 12 checks if a vectorization hint is provided.
-;OVERRIDE-LABEL: func2x6(
-;OVERRIDE-NOT: <4 x i32>
-;OVERRIDE: ret
 define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
 entry:
   br label %for.body
@@ -100,4 +87,3 @@ for.body:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret i32 undef
 }
-


        

