[llvm] [LV]Enable max safe distance in predicated DataWithEVL vectorization mode. (PR #100755)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 22 10:37:21 PST 2024


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/100755

From 4c16b87b90aee4bebc1b6640af554fd449565d8d Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 26 Jul 2024 14:38:27 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  25 ++
 .../Vectorize/LoopVectorizationLegality.h     |  12 +
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  55 ++++-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  75 ++++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   5 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  11 +-
 .../Transforms/Vectorize/VPlanTransforms.h    |   4 +-
 .../LoopAccessAnalysis/depend_diff_types.ll   |   4 +-
 .../forward-loop-independent.ll               |   5 +-
 .../forward-negative-step.ll                  |   5 +-
 .../num-iters-for-store-load-conflict.ll      |  40 ++--
 .../Analysis/LoopAccessAnalysis/pr64637.ll    |   5 +-
 .../stride-access-dependence.ll               |   5 +-
 .../RISCV/riscv-vector-reverse.ll             |   4 +-
 ...e-force-tail-with-evl-safe-dep-distance.ll | 225 +++++++++++++-----
 .../LoopVectorize/memory-dep-remarks.ll       |  25 +-
 16 files changed, 359 insertions(+), 146 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cc40d2e83f2e0a..b661e117d01ee7 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -37,6 +37,8 @@ class Value;
 struct VectorizerParams {
   /// Maximum SIMD width.
   static const unsigned MaxVectorWidth;
+  /// Maximum LMUL factor.
+  static const unsigned MaxVectorLMUL;
 
   /// VF as overridden by the user.
   static unsigned VectorizationFactor;
@@ -222,6 +224,23 @@ class MemoryDepChecker {
     return MaxSafeVectorWidthInBits;
   }
 
+  /// Return the maximum power-of-2 number of elements that does not prevent
+  /// store-load forwarding.
+  std::optional<uint64_t> getStoreLoadForwardSafeVFPowerOf2() const {
+    if (MaxStoreLoadForwardSafeVF.first == std::numeric_limits<uint64_t>::max())
+      return std::nullopt;
+    return MaxStoreLoadForwardSafeVF.first;
+  }
+
+  /// Return the maximum non-power-of-2 number of elements that does not
+  /// prevent store-load forwarding.
+  std::optional<uint64_t> getStoreLoadForwardSafeVFNonPowerOf2() const {
+    if (MaxStoreLoadForwardSafeVF.second ==
+        std::numeric_limits<uint64_t>::max())
+      return std::nullopt;
+    return MaxStoreLoadForwardSafeVF.second;
+  }
+
   /// In some cases when the dependency check fails we can still
   /// vectorize the loop with a dynamic array access check.
   bool shouldRetryWithRuntimeCheck() const {
@@ -310,6 +329,12 @@ class MemoryDepChecker {
   /// restrictive.
   uint64_t MaxSafeVectorWidthInBits = -1U;
 
+  /// Maximum numbers of elements (power-of-2 and non-power-of-2) that do not
+  /// prevent store-load forwarding.
+  std::pair<uint64_t, uint64_t> MaxStoreLoadForwardSafeVF =
+      std::make_pair(std::numeric_limits<uint64_t>::max(),
+                     std::numeric_limits<uint64_t>::max());
+
   /// If we see a non-constant dependence distance we can still try to
   /// vectorize this loop with runtime checks.
   bool FoundNonConstantDistanceDependence = false;
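
A note on the new state: MaxStoreLoadForwardSafeVF keeps UINT64_MAX as an
"unrestricted" sentinel, and the two accessors above translate that sentinel
into std::nullopt. A minimal standalone sketch of that pattern, with local
stand-in names rather than the actual MemoryDepChecker members:

#include <cstdint>
#include <limits>
#include <optional>
#include <utility>

// Illustrative stand-in: .first tracks the power-of-2 limit, .second the
// non-power-of-2 limit; UINT64_MAX means "no store-load forwarding
// restriction recorded yet".
struct ForwardSafeVF {
  std::pair<uint64_t, uint64_t> Limits{std::numeric_limits<uint64_t>::max(),
                                       std::numeric_limits<uint64_t>::max()};

  std::optional<uint64_t> powerOf2() const {
    if (Limits.first == std::numeric_limits<uint64_t>::max())
      return std::nullopt; // unrestricted, so no explicit cap to report
    return Limits.first;
  }
  std::optional<uint64_t> nonPowerOf2() const {
    if (Limits.second == std::numeric_limits<uint64_t>::max())
      return std::nullopt;
    return Limits.second;
  }
};
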
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 0f4d1355dd2bfe..c16a5f9a1344c2 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -377,6 +377,18 @@ class LoopVectorizationLegality {
     return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
   }
 
+  /// Return the maximum power-of-2 number of elements that does not prevent
+  /// store-load forwarding.
+  std::optional<unsigned> getMaxStoreLoadForwardSafeVFPowerOf2() const {
+    return LAI->getDepChecker().getStoreLoadForwardSafeVFPowerOf2();
+  }
+
+  /// Return the maximum non-power-of-2 number of elements that does not
+  /// prevent store-load forwarding.
+  std::optional<unsigned> getMaxStoreLoadForwardSafeVFNonPowerOf2() const {
+    return LAI->getDepChecker().getStoreLoadForwardSafeVFNonPowerOf2();
+  }
+
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
   bool isMaskRequired(const Instruction *I) const {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 646d2f7ef30773..29816bd1d845cc 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -100,6 +100,8 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold(
 
 /// Maximum SIMD width.
 const unsigned VectorizerParams::MaxVectorWidth = 64;
+/// Maximum LMUL factor.
+const unsigned VectorizerParams::MaxVectorLMUL = 8;
 
 /// We collect dependences up to this threshold.
 static cl::opt<unsigned>
@@ -1764,31 +1766,64 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
   // cause any slowdowns.
   const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
   // Maximum vector factor.
-  uint64_t MaxVFWithoutSLForwardIssues = std::min(
-      VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
+  uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
+      std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
+               MaxStoreLoadForwardSafeVF.first);
+  uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 =
+      std::min(VectorizerParams::MaxVectorLMUL *
+                   VectorizerParams::MaxVectorWidth * TypeByteSize,
+               MaxStoreLoadForwardSafeVF.second);
 
   // Compute the smallest VF at which the store and load would be misaligned.
-  for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
-       VF *= 2) {
+  for (uint64_t VF = 2 * TypeByteSize;
+       VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
     // If the number of vector iteration between the store and the load are
     // small we could incur conflicts.
     if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
-      MaxVFWithoutSLForwardIssues = (VF >> 1);
+      MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
+      break;
+    }
+  }
+  // RISC-V VLA supports non-power-of-2 vector factors, so iterate in backward
+  // order to find the largest VF for which the stores and loads stay aligned
+  // or the number of iterations between the conflicting memory addresses is
+  // at least NumItersForStoreLoadThroughMemory (8).
+  for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2,
+                E = 2 * TypeByteSize;
+       VF >= E; VF -= TypeByteSize) {
+    if (Distance % VF == 0 ||
+        Distance / VF >= NumItersForStoreLoadThroughMemory) {
+      uint64_t GCD = MaxStoreLoadForwardSafeVF.second ==
+                             std::numeric_limits<uint64_t>::max()
+                         ? VF
+                         : std::gcd(MaxStoreLoadForwardSafeVF.second, VF);
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
       break;
     }
   }
 
-  if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
+  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
+      MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
     LLVM_DEBUG(
         dbgs() << "LAA: Distance " << Distance
                << " that could cause a store-load forwarding conflict\n");
     return true;
   }
 
-  if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
-      MaxVFWithoutSLForwardIssues !=
-          VectorizerParams::MaxVectorWidth * TypeByteSize)
-    MinDepDistBytes = MaxVFWithoutSLForwardIssues;
+  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize)
+    MaxStoreLoadForwardSafeVF.first = 1;
+  else if (MaxVFWithoutSLForwardIssuesPowerOf2 <
+               MaxStoreLoadForwardSafeVF.first &&
+           MaxVFWithoutSLForwardIssuesPowerOf2 !=
+               VectorizerParams::MaxVectorWidth * TypeByteSize)
+    MaxStoreLoadForwardSafeVF.first = MaxVFWithoutSLForwardIssuesPowerOf2;
+  if (MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize)
+    MaxStoreLoadForwardSafeVF.second = 1;
+  else if (MaxVFWithoutSLForwardIssuesNonPowerOf2 <
+               MaxStoreLoadForwardSafeVF.second &&
+           MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
+               VectorizerParams::MaxVectorWidth * TypeByteSize)
+    MaxStoreLoadForwardSafeVF.second = MaxVFWithoutSLForwardIssuesNonPowerOf2;
   return false;
 }
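
The non-power-of-2 search added above can also be read as a small standalone
routine: walk down from the largest candidate VF in TypeByteSize steps until
the two accesses are distance-aligned or far enough apart, then intersect the
result with any previously recorded limit via gcd. A rough, simplified model
(a made-up helper, not the in-tree function; it reports the no-candidate case
as a stall, which the patch handles a little differently):

#include <cstdint>
#include <numeric>  // std::gcd
#include <optional>

// Distance, TypeByteSize and MaxCandidate are in bytes, mirroring
// couldPreventStoreLoadForward; PrevLimit mirrors
// MaxStoreLoadForwardSafeVF.second (no value == unrestricted).
std::optional<uint64_t>
maxNonPow2SafeVF(uint64_t Distance, uint64_t TypeByteSize,
                 uint64_t MaxCandidate, std::optional<uint64_t> PrevLimit) {
  const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
  for (uint64_t VF = MaxCandidate; VF >= 2 * TypeByteSize; VF -= TypeByteSize) {
    // Safe if the accesses stay aligned (Distance % VF == 0) or the conflict
    // is at least 8 iterations away.
    if (Distance % VF == 0 ||
        Distance / VF >= NumItersForStoreLoadThroughMemory)
      return PrevLimit ? std::gcd(*PrevLimit, VF) : VF;
  }
  return std::nullopt; // simplification: no safe non-power-of-2 VF found
}
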
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f526808..28e814e9c89e9c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1444,9 +1444,8 @@ class LoopVectorizationCostModel {
 
   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
   /// overflow or not.
-  /// \param IsScalableVF true if scalable vector factors enabled.
   /// \param UserIC User specific interleave count.
-  void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+  void setTailFoldingStyles(unsigned UserIC) {
     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
     if (!Legal->canFoldTailByMasking()) {
       ChosenTailFoldingStyle =
@@ -1470,11 +1469,9 @@ class LoopVectorizationCostModel {
     // FIXME: use actual opcode/data type for analysis here.
     // FIXME: Investigate opportunity for fixed vector factor.
     bool EVLIsLegal =
-        IsScalableVF && UserIC <= 1 &&
+        UserIC <= 1 &&
         TTI.hasActiveVectorLength(0, nullptr, Align()) &&
-        !EnableVPlanNativePath &&
-        // FIXME: implement support for max safe dependency distance.
-        Legal->isSafeForAnyVectorWidth();
+        !EnableVPlanNativePath;
     if (!EVLIsLegal) {
       // If for some reason EVL mode is unsupported, fallback to
       // DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -1492,6 +1489,14 @@ class LoopVectorizationCostModel {
     }
   }
 
+  /// Disables the previously chosen tail folding policy, resetting it to
+  /// None. Expects that a tail folding policy has already been selected.
+  void disableTailFolding() {
+    assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+    ChosenTailFoldingStyle =
+        std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+  }
+
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const {
     // TODO: check if it is possible to check for None style independent of
@@ -1499,6 +1504,14 @@ class LoopVectorizationCostModel {
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
+  /// Return the maximum safe number of elements to process without
+  /// preventing store-load forwarding.
+  /// TODO: need to consider adjusting cost model to use this value as a
+  /// vectorization factor for EVL-based vectorization.
+  std::optional<unsigned> getMaxEVLSafeElements() const {
+    return MaxEVLSafeElements;
+  }
+
   /// Returns true if the instructions in this block requires predication
   /// for any reason, e.g. because tail folding now requires a predicate
   /// or because the block in the original loop was predicated.
@@ -1654,6 +1667,10 @@ class LoopVectorizationCostModel {
   /// true if scalable vectorization is supported and enabled.
   std::optional<bool> IsScalableVectorizationAllowed;
 
+  /// Maximum safe number of elements to process without preventing
+  /// store-load forwarding.
+  std::optional<unsigned> MaxEVLSafeElements;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -3903,11 +3920,31 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
   // the memory accesses that is most restrictive (involved in the smallest
   // dependence distance).
-  unsigned MaxSafeElements =
-      llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+  unsigned MaxSafeElements = Legal->getMaxSafeVectorWidthInBits() / WidestType;
+  if (Legal->isSafeForAnyVectorWidth())
+    MaxSafeElements = PowerOf2Ceil(MaxSafeElements);
+  unsigned MaxFixedSafeElements = std::gcd(
+      MaxSafeElements,
+      Legal->getMaxStoreLoadForwardSafeVFPowerOf2().value_or(MaxSafeElements));
+  MaxFixedSafeElements = bit_floor(MaxFixedSafeElements);
+  unsigned MaxScalableSafeElements = MaxFixedSafeElements;
+  if (foldTailWithEVL()) {
+    MaxScalableSafeElements = std::numeric_limits<unsigned>::max();
+    std::optional<unsigned> SafeStoreLoadForwarding =
+        Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2();
+    if (!Legal->isSafeForAnyVectorWidth() || SafeStoreLoadForwarding) {
+      unsigned SLForwardDist =
+          Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2().value_or(
+              MaxSafeElements);
+      if (MaxSafeElements >= SLForwardDist)
+        MaxEVLSafeElements = SLForwardDist;
+      else
+        MaxEVLSafeElements = std::gcd(MaxSafeElements, SLForwardDist);
+    }
+  }
 
-  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
-  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+  auto MaxSafeFixedVF = ElementCount::getFixed(MaxFixedSafeElements);
+  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxScalableSafeElements);
 
   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                     << ".\n");
@@ -4077,7 +4114,13 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
 
-  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
+  // If we don't know the precise trip count, or if the trip count that we
+  // found modulo the vectorization factor is not zero, try to fold the tail
+  // by masking.
+  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+  setTailFoldingStyles(UserIC);
+  FixedScalableVFPair MaxFactors =
+      computeFeasibleMaxVF(MaxTC, UserVF, foldTailByMasking());
 
   // Avoid tail folding if the trip count is known to be a multiple of any VF
   // we choose.
@@ -4108,15 +4151,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     if (Rem->isZero()) {
       // Accept MaxFixedVF if we do not have a tail.
       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+      disableTailFolding();
       return MaxFactors;
     }
   }
 
-  // If we don't know the precise trip count, or if the trip count that we
-  // found modulo the vectorization factor is not zero, try to fold the tail
-  // by masking.
-  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
   if (foldTailByMasking()) {
     if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
       LLVM_DEBUG(
@@ -8388,8 +8427,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
       VPlanTransforms::optimize(*Plan, *PSE.getSE());
       // TODO: try to put it close to addActiveLaneMask().
       // Discard the plan if it is not EVL-compatible
-      if (CM.foldTailWithEVL() &&
-          !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+      if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
+                                      *Plan, CM.getMaxEVLSafeElements()))
         break;
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
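
To see the structure of the new computeFeasibleMaxVF limits in isolation: the
fixed-VF cap is the power-of-2 floor of the gcd between the dependence limit
and the power-of-2 store-load forwarding limit, while the EVL clamp preserves
the (possibly non-power-of-2) forwarding-safe distance. A standalone sketch
with stand-in names; the inputs in the example below are made up:

#include <bit>      // std::bit_floor, std::bit_ceil (C++20)
#include <numeric>  // std::gcd
#include <optional>

struct SafeVFLimits {
  unsigned MaxFixedSafeElements;
  std::optional<unsigned> MaxEVLSafeElements; // only set when folding with EVL
};

SafeVFLimits computeLimits(unsigned MaxSafeElements, bool SafeForAnyWidth,
                           std::optional<unsigned> ForwardSafePow2,
                           std::optional<unsigned> ForwardSafeNonPow2,
                           bool FoldTailWithEVL) {
  if (SafeForAnyWidth)
    MaxSafeElements = std::bit_ceil(MaxSafeElements);
  unsigned Fixed =
      std::gcd(MaxSafeElements, ForwardSafePow2.value_or(MaxSafeElements));
  Fixed = std::bit_floor(Fixed);

  std::optional<unsigned> EVL;
  if (FoldTailWithEVL && (!SafeForAnyWidth || ForwardSafeNonPow2)) {
    unsigned SLDist = ForwardSafeNonPow2.value_or(MaxSafeElements);
    EVL = MaxSafeElements >= SLDist ? SLDist
                                    : std::gcd(MaxSafeElements, SLDist);
  }
  return {Fixed, EVL};
}

For example, computeLimits(12, /*SafeForAnyWidth=*/false, /*Pow2=*/8,
/*NonPow2=*/9, /*EVL=*/true) yields a fixed cap of 4 (bit_floor of gcd(12, 8))
but an EVL clamp of 9, which is the kind of asymmetry the patch enables for
EVL mode.
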
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d3..de24688593ebe9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -471,6 +471,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
       assert(State.VF.isScalable() && "Expected scalable vector factor.");
       Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
 
+      if (getNumOperands() == 3) {
+        Value *MaxSafeVF = State.get(getOperand(2), VPIteration(0, 0));
+        AVL = State.Builder.CreateBinaryIntrinsic(Intrinsic::umin, AVL,
+                                                  MaxSafeVF);
+      }
       Value *EVL = State.Builder.CreateIntrinsic(
           State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
           {AVL, VFArg, State.Builder.getTrue()});
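
The effect of the optional third operand is easiest to see in scalar form: the
remaining trip count (AVL) is first clamped with umin against the safe-distance
constant and only then fed to @llvm.experimental.get.vector.length, so a single
vector step never covers more than the safe number of elements. A rough scalar
model, with get.vector.length simplified to a plain min against VLMAX:

#include <algorithm>
#include <cstdint>

// TripCount - Consumed is the AVL; MaxSafeElements == 0 means "no clamp".
uint64_t nextEVL(uint64_t TripCount, uint64_t Consumed, uint64_t VLMax,
                 uint64_t MaxSafeElements) {
  uint64_t AVL = TripCount - Consumed;
  if (MaxSafeElements)
    AVL = std::min(AVL, MaxSafeElements); // the new umin on the AVL
  return std::min(AVL, VLMax);            // get.vector.length, simplified
}

This corresponds to the umin(..., i64 9) feeding
@llvm.experimental.get.vector.length in the @test_may_clobber2 checks further
down.
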
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e311..e703bb893d9383 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1427,7 +1427,8 @@ void VPlanTransforms::addActiveLaneMask(
 /// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
 /// ...
 ///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(
+    VPlan &Plan, const std::optional<unsigned> &MaxEVLSafeElements) {
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   // The transform updates all users of inductions to work based on EVL, instead
   // of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1452,8 +1453,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
   // Create the ExplicitVectorLengthPhi recipe in the main loop.
   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
   EVLPhi->insertAfter(CanonicalIVPHI);
-  auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
-                                  {EVLPhi, Plan.getTripCount()});
+  SmallVector<VPValue *, 3> Operands = {EVLPhi, Plan.getTripCount()};
+  if (MaxEVLSafeElements)
+    Operands.push_back(Plan.getOrAddLiveIn(ConstantInt::get(
+        CanonicalIVPHI->getScalarType(), *MaxEVLSafeElements)));
+  auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, Operands,
+                                  DebugLoc());
   VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
 
   auto *CanonicalIVIncrement =
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c2..8158c832f1a951 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -105,7 +105,9 @@ struct VPlanTransforms {
   /// VPCanonicalIVPHIRecipe is only used to control the loop after
   /// this transformation.
   /// \returns true if the transformation succeeds, or false if it doesn't.
-  static bool tryAddExplicitVectorLength(VPlan &Plan);
+  static bool
+  tryAddExplicitVectorLength(VPlan &Plan,
+                             const std::optional<unsigned> &MaxEVLSafeElements);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index 81d8b01fe7fb72..c5ba25a5c0ace5 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -140,11 +140,11 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
 ; CHECK-NEXT:            %ld.i64 = load i64, ptr %gep.iv, align 8 ->
 ; CHECK-NEXT:            store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
 ; CHECK-EMPTY:
-; CHECK-NEXT:        BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %ld.f64 = load double, ptr %gep.iv, align 8 ->
 ; CHECK-NEXT:            store double %val, ptr %gep.iv.101.i64, align 8
 ; CHECK-EMPTY:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store double %val, ptr %gep.iv.101.i64, align 8 ->
 ; CHECK-NEXT:            %ld.i64 = load i64, ptr %gep.iv, align 8
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
index 7fc9958dba552b..6e4bcec013a73a 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
@@ -24,14 +24,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @f(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %N) {
 ; CHECK-LABEL: 'f'
 ; CHECK-NEXT:    for.body:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 %b_p1, ptr %Aidx, align 4 ->
 ; CHECK-NEXT:            %a = load i32, ptr %Aidx, align 4
 ; CHECK-EMPTY:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 %b_p2, ptr %Aidx_next, align 4 ->
 ; CHECK-NEXT:            %a = load i32, ptr %Aidx, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
index 46e81cd74ab31f..f182a3d2aa7d17 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
@@ -47,10 +47,9 @@ exit:
 define void @neg_step_ForwardButPreventsForwarding(ptr nocapture %A, ptr noalias %B) {
 ; CHECK-LABEL: 'neg_step_ForwardButPreventsForwarding'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 0, ptr %gep.A, align 4 ->
 ; CHECK-NEXT:            %l = load i32, ptr %gep.A.plus.1, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll b/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
index d3eda21dee27e6..3bb51553692056 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/num-iters-for-store-load-conflict.ll
@@ -4,10 +4,9 @@
 define void @forward_dist_7(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_7'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT:            %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -40,10 +39,9 @@ exit:
 define void @forward_dist_9(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_9'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT:            %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -76,10 +74,9 @@ exit:
 define void @forward_dist_11(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_11'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT:            %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -112,10 +109,9 @@ exit:
 define void @forward_dist_13(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_13'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT:            %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -148,10 +144,9 @@ exit:
 define void @forward_dist_15(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_15'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT:            %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -184,10 +179,9 @@ exit:
 define void @forward_dist_17(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_17'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT:            %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -220,10 +214,9 @@ exit:
 define void @forward_dist_19(ptr %A, ptr noalias %B) {
 ; CHECK-LABEL: 'forward_dist_19'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        ForwardButPreventsForwarding:
+; CHECK-NEXT:        Forward:
 ; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT:            %l = load i32, ptr %gep.1, align 4
 ; CHECK-EMPTY:
@@ -258,10 +251,9 @@ exit:
 define void @unknown_loop_bounds(i64 %x, i64 %y) {
 ; CHECK-LABEL: 'unknown_loop_bounds'
 ; CHECK-NEXT:    inner:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 2368 bits
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %l = load double, ptr %gep.0, align 8 ->
 ; CHECK-NEXT:            store double %l, ptr %gep.1, align 8
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
index d3e589cf99cf36..4c0fb6adce1a80 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
@@ -20,10 +20,9 @@
 define void @foo(ptr noalias nocapture noundef %y, ptr noalias nocapture noundef readnone %x, ptr noalias nocapture noundef readonly %indices, i32 noundef %n) {
 ; CHECK-LABEL: 'foo'
 ; CHECK-NEXT:    for.body:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop.
-; CHECK-NEXT:  Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 96 bits
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %1 = load i32, ptr %arrayidx, align 4 ->
 ; CHECK-NEXT:            store i32 %add8, ptr %arrayidx12, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
index ef19e173b65994..6aeebebb057665 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -416,10 +416,9 @@ for.body:                                         ; preds = %entry, %for.body
 define void @vectorizable_unscaled_Read_Write(ptr nocapture %A) {
 ; CHECK-LABEL: 'vectorizable_unscaled_Read_Write'
 ; CHECK-NEXT:    for.body:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 32 bits
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %0 = load i32, ptr %arrayidx, align 4 ->
 ; CHECK-NEXT:            store i32 %add, ptr %arrayidx2, align 4
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 3a14842580425b..4906cc1ca30421 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -21,7 +21,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
-; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT:  LV: The max safe fixed VF is: 134217728.
 ; CHECK-NEXT:  LV: The max safe scalable VF is: vscale x 4294967295.
 ; CHECK-NEXT:  LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -226,7 +226,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
-; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
+; CHECK-NEXT:  LV: The max safe fixed VF is: 134217728.
 ; CHECK-NEXT:  LV: The max safe scalable VF is: vscale x 4294967295.
 ; CHECK-NEXT:  LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
index 2dd47d5c1ea8a7..6143ca8de8b346 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll
@@ -103,24 +103,38 @@ define void @test_may_clobber1(ptr %p) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 100
-; IF-EVL-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT:    store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; IF-EVL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 100)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP8]], 100
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
-; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT:    br label [[LOOP:%.*]]
 ; IF-EVL:       loop:
 ; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -192,17 +206,51 @@ exit:
 define void @test_may_clobber2(ptr %p) {
 ; IF-EVL-LABEL: @test_may_clobber2(
 ; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 9)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP8]], 9
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT:    br label [[LOOP:%.*]]
 ; IF-EVL:       loop:
-; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[IV]]
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; IF-EVL-NEXT:    [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
 ; IF-EVL-NEXT:    [[V:%.*]] = load i64, ptr [[A1]], align 32
 ; IF-EVL-NEXT:    [[OFFSET:%.*]] = add i64 [[IV]], 9
 ; IF-EVL-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
 ; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
 ; IF-EVL:       exit:
 ; IF-EVL-NEXT:    ret void
 ;
@@ -245,24 +293,38 @@ define void @test_may_clobber3(ptr %p) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 200, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; IF-EVL-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32
-; IF-EVL-NEXT:    [[TMP3:%.*]] = add i64 [[TMP0]], 10
-; IF-EVL-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
-; IF-EVL-NEXT:    store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IF-EVL-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; IF-EVL-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 200, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 10)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP8]], 10
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
-; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT:    br label [[LOOP:%.*]]
 ; IF-EVL:       loop:
 ; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -273,7 +335,7 @@ define void @test_may_clobber3(ptr %p) {
 ; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
 ; IF-EVL:       exit:
 ; IF-EVL-NEXT:    ret void
 ;
@@ -363,7 +425,7 @@ define void @trivial_due_max_vscale(ptr %p) {
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
@@ -378,7 +440,7 @@ define void @trivial_due_max_vscale(ptr %p) {
 ; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
 ; IF-EVL:       exit:
 ; IF-EVL-NEXT:    ret void
 ;
@@ -422,28 +484,38 @@ define void @no_high_lmul_or_interleave(ptr %p) {
 ; IF-EVL-NEXT:  entry:
 ; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 3002, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; IF-EVL-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 3001, i64 3001, i64 3001, i64 3001>
-; IF-EVL-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 32, <4 x i1> [[TMP1]], <4 x i64> poison)
-; IF-EVL-NEXT:    [[TMP4:%.*]] = add i64 [[TMP0]], 1024
-; IF-EVL-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP4]]
-; IF-EVL-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; IF-EVL-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_MASKED_LOAD]], ptr [[TMP6]], i32 32, <4 x i1> [[TMP1]])
-; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; IF-EVL-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3004
-; IF-EVL-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 3002, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 1024)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 2, i1 true)
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 32 [[TMP10]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP8]], 1024
+; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP_LOAD]], ptr align 32 [[TMP13]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
-; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 3004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT:    br label [[LOOP:%.*]]
 ; IF-EVL:       loop:
 ; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -454,7 +526,7 @@ define void @no_high_lmul_or_interleave(ptr %p) {
 ; IF-EVL-NEXT:    store i64 [[V]], ptr [[A2]], align 32
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT:    [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
-; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; IF-EVL-NEXT:    br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
 ; IF-EVL:       exit:
 ; IF-EVL-NEXT:    ret void
 ;
@@ -495,22 +567,63 @@ exit:
 define void @non-power-2-storeloadforward(ptr %A) {
 ; IF-EVL-LABEL: @non-power-2-storeloadforward(
 ; IF-EVL-NEXT:  entry:
+; IF-EVL-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL:       vector.ph:
+; IF-EVL-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT:    [[N_RND_UP:%.*]] = add i64 112, [[TMP2]]
+; IF-EVL-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT:    [[IND_END:%.*]] = add i64 16, [[N_VEC]]
+; IF-EVL-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
+; IF-EVL:       vector.body:
+; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP5:%.*]] = sub i64 112, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP5]], i64 3)
+; IF-EVL-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP6]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = add i64 16, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT:    [[TMP9:%.*]] = add nsw i64 [[TMP8]], -3
+; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]]
+; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP12:%.*]] = add nsw i64 [[TMP8]], 4
+; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP15]], ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IF-EVL:       middle.block:
+; IF-EVL-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL:       scalar.ph:
+; IF-EVL-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 16, [[ENTRY:%.*]] ]
 ; IF-EVL-NEXT:    br label [[FOR_BODY:%.*]]
 ; IF-EVL:       for.body:
-; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ 16, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT:    [[TMP0:%.*]] = add nsw i64 [[IV]], -3
-; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
-; IF-EVL-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT:    [[TMP2:%.*]] = add nsw i64 [[IV]], 4
-; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
-; IF-EVL-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP3]], [[TMP1]]
+; IF-EVL-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = add nsw i64 [[IV]], -3
+; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]]
+; IF-EVL-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[TMP22:%.*]] = add nsw i64 [[IV]], 4
+; IF-EVL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
 ; IF-EVL-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
 ; IF-EVL-NEXT:    store i32 [[ADD3]], ptr [[ARRAYIDX5]], align 4
 ; IF-EVL-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; IF-EVL-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
 ; IF-EVL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
-; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; IF-EVL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP15:![0-9]+]]
 ; IF-EVL:       for.end:
 ; IF-EVL-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll b/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
index d96d85512621c6..6945afd09dd072 100644
--- a/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
+++ b/llvm/test/Transforms/LoopVectorize/memory-dep-remarks.ll
@@ -194,9 +194,7 @@ for.body:                                         ; preds = %for.body.preheader,
 ;   }
 ; }
 
-; CHECK: remark: source.c:61:12: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding. Memory location is the same as accessed at source.c:60:5
-
+; CHECK-NOT: remark: source.c:{{[0-9]+}}:{{[0-9]+}}:
 define void @test_forwardButPreventsForwarding_dep(i64 %n, ptr nocapture %A, ptr nocapture %B) !dbg !166 {
 entry:
   %cmp11 = icmp sgt i64 %n, 3
@@ -233,7 +231,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; }
 
 ; CHECK: remark: source.c:74:5: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK: Backward loop carried data dependence that prevents store-to-load forwarding. Memory location is the same as accessed at source.c:74:21
+; CHECK: Backward loop carried data dependence that prevents store-to-load forwarding.
 
 define void @test_backwardVectorizableButPreventsForwarding(i64 %n, ptr nocapture %A) !dbg !189 {
 entry:
@@ -328,25 +326,19 @@ for.body:                                         ; preds = %entry, %for.body
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          loop not vectorized
 ; YAML-NEXT: ...
-; YAML-NEXT: --- !Analysis
+; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            loop-vectorize
-; YAML-NEXT: Name:            UnsafeDep
-; YAML-NEXT: DebugLoc:        { File: source.c, Line: 61, Column: 12 }
+; YAML-NEXT: Name:            VectorizationNotBeneficial
 ; YAML-NEXT: Function:        test_forwardButPreventsForwarding_dep
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'loop not vectorized: '
-; YAML-NEXT:   - String:          'unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop'
-; YAML-NEXT:   - String:          "\nForward loop carried data dependence that prevents store-to-load forwarding."
-; YAML-NEXT:   - String:          ' Memory location is the same as accessed at '
-; YAML-NEXT:   - Location:        'source.c:60:5'
-; YAML-NEXT:     DebugLoc:        { File: source.c, Line: 60, Column: 5 }
+; YAML-NEXT:   - String:          the cost-model indicates that vectorization is not beneficial
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            loop-vectorize
-; YAML-NEXT: Name:            MissedDetails
+; YAML-NEXT: Name:            InterleavingNotBeneficial
 ; YAML-NEXT: Function:        test_forwardButPreventsForwarding_dep
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          loop not vectorized
+; YAML-NEXT:   - String:          the cost-model indicates that interleaving is not beneficial
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
@@ -357,9 +349,6 @@ for.body:                                         ; preds = %entry, %for.body
 ; YAML-NEXT:   - String:          'loop not vectorized: '
 ; YAML-NEXT:   - String:          'unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop'
 ; YAML-NEXT:   - String:          "\nBackward loop carried data dependence that prevents store-to-load forwarding."
-; YAML-NEXT:   - String:          ' Memory location is the same as accessed at '
-; YAML-NEXT:   - Location:        'source.c:74:21'
-; YAML-NEXT:     DebugLoc:        { File: source.c, Line: 74, Column: 21 }
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            loop-vectorize


