[llvm] 51d648c - Revert "[LV] Calculate max feasible scalable VF."

Thu Apr 29 08:04:55 PDT 2021

Author: Sander de Smalen
Date: 2021-04-29T16:04:37+01:00
New Revision: 51d648c119d7773ce6fb809353bd6bd14bca8818

URL: https://github.com/llvm/llvm-project/commit/51d648c119d7773ce6fb809353bd6bd14bca8818
DIFF: https://github.com/llvm/llvm-project/commit/51d648c119d7773ce6fb809353bd6bd14bca8818.diff

LOG: Revert "[LV] Calculate max feasible scalable VF."

Temporarily reverting this patch due to some unexpected issue found
by one of the PPC buildbots.

This reverts commit 584e9b6e4b4987b882719923e640eed854613d91.

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
    llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll

Removed: 
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index ad6a4b561a9bb..ecb44a7b1518d 100644

--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -174,13 +174,6 @@ void reportVectorizationFailure(const StringRef DebugMsg,
     const StringRef OREMsg, const StringRef ORETag,
     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr);
 
-/// Reports an informative message: print \p Msg for debugging purposes as well
-/// as an optimization remark. Uses either \p I as location of the remark, or
-/// otherwise \p TheLoop.
-void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag,
-                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
-                             Instruction *I = nullptr);
-
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0fe92b3685e54..f8f47c475088e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1065,13 +1065,13 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
     B.SetCurrentDebugLocation(DebugLoc());
 }
 
-/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
-/// is passed, the message relates to that particular instruction.
+/// Write a record \p DebugMsg about vectorization failure to the debug
+/// output stream. If \p I is passed, it is an instruction that prevents
+/// vectorization.
 #ifndef NDEBUG
-static void debugVectorizationMessage(const StringRef Prefix,
-                                      const StringRef DebugMsg,
-                                      Instruction *I) {
-  dbgs() << "LV: " << Prefix << DebugMsg;
+static void debugVectorizationFailure(const StringRef DebugMsg,
+    Instruction *I) {
+  dbgs() << "LV: Not vectorizing: " << DebugMsg;
   if (I != nullptr)
     dbgs() << " " << *I;
   else
@@ -1100,7 +1100,9 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
       DL = I->getDebugLoc();
   }
 
-  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
+  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+  R << "loop not vectorized: ";
+  return R;
 }
 
 /// Return a value for Step multiplied by VF.
@@ -1121,24 +1123,12 @@ Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
 }
 
 void reportVectorizationFailure(const StringRef DebugMsg,
-                                const StringRef OREMsg, const StringRef ORETag,
-                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
-                                Instruction *I) {
-  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
-  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
-  ORE->emit(
-      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
-      << "loop not vectorized: " << OREMsg);
-}
-
-void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
-                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
-                             Instruction *I) {
-  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
+    const StringRef OREMsg, const StringRef ORETag,
+    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
+  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
-  ORE->emit(
-      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
-      << Msg);
+  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
+                ORETag, TheLoop, I) << OREMsg);
 }
 
 } // end namespace llvm
@@ -1633,18 +1623,6 @@ class LoopVectorizationCostModel {
   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                     ElementCount UserVF);
 
-  /// \return the maximized element count based on the targets vector
-  /// registers and the loop trip-count, but limited to a maximum safe VF.
-  /// This is a helper function of computeFeasibleMaxVF.
-  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
-                                       unsigned SmallestType,
-                                       unsigned WidestType,
-                                       ElementCount MaxSafeVF);
-
-  /// \return the maximum legal scalable VF, based on the safe max number
-  /// of elements.
-  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
-
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
   /// operate on
@@ -5598,129 +5576,6 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
   return false;
 }
 
-ElementCount
-LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
-  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
-    reportVectorizationInfo(
-        "Disabling scalable vectorization, because target does not "
-        "support scalable vectors.",
-        "ScalableVectorsUnsupported", ORE, TheLoop);
-    return ElementCount::getScalable(0);
-  }
-
-  auto MaxScalableVF = ElementCount::getScalable(1u << 16);
-
-  // Disable scalable vectorization if the loop contains unsupported reductions.
-  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
-  // FIXME: While for scalable vectors this is currently sufficient, this should
-  // be replaced by a more detailed mechanism that filters out specific VFs,
-  // instead of invalidating vectorization for a whole set of VFs based on the
-  // MaxVF.
-  if (!canVectorizeReductions(MaxScalableVF)) {
-    reportVectorizationInfo(
-        "Scalable vectorization not supported for the reduction "
-        "operations found in this loop.",
-        "ScalableVFUnfeasible", ORE, TheLoop);
-    return ElementCount::getScalable(0);
-  }
-
-  if (Legal->isSafeForAnyVectorWidth())
-    return MaxScalableVF;
-
-  // Limit MaxScalableVF by the maximum safe dependence distance.
-  Optional<unsigned> MaxVScale = TTI.getMaxVScale();
-  MaxScalableVF = ElementCount::getScalable(
-      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
-  if (!MaxScalableVF)
-    reportVectorizationInfo(
-        "Max legal vector width too small, scalable vectorization "
-        "unfeasible.",
-        "ScalableVFUnfeasible", ORE, TheLoop);
-
-  return MaxScalableVF;
-}
-
-ElementCount
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
-                                                 ElementCount UserVF) {
-  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
-  unsigned SmallestType, WidestType;
-  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
-
-  // Get the maximum safe dependence distance in bits computed by LAA.
-  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
-  // the memory accesses that is most restrictive (involved in the smallest
-  // dependence distance).
-  unsigned MaxSafeElements =
-      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
-
-  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
-  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
-
-  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
-                    << ".\n");
-  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
-                    << ".\n");
-
-  // First analyze the UserVF, fall back if the UserVF should be ignored.
-  if (UserVF) {
-    auto MaxSafeUserVF =
-        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
-
-    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
-      return UserVF;
-
-    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
-
-    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
-    // is better to ignore the hint and let the compiler choose a suitable VF.
-    if (!UserVF.isScalable()) {
-      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
-                        << " is unsafe, clamping to max safe VF="
-                        << MaxSafeFixedVF << ".\n");
-      ORE->emit([&]() {
-        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
-                                          TheLoop->getStartLoc(),
-                                          TheLoop->getHeader())
-               << "User-specified vectorization factor "
-               << ore::NV("UserVectorizationFactor", UserVF)
-               << " is unsafe, clamping to maximum safe vectorization factor "
-               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
-      });
-      return MaxSafeFixedVF;
-    }
-
-    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
-                      << " is unsafe. Ignoring scalable UserVF.\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
-                                        TheLoop->getStartLoc(),
-                                        TheLoop->getHeader())
-             << "User-specified vectorization factor "
-             << ore::NV("UserVectorizationFactor", UserVF)
-             << " is unsafe. Ignoring the hint to let the compiler pick a "
-                "suitable VF.";
-    });
-  }
-
-  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
-                    << " / " << WidestType << " bits.\n");
-
-  ElementCount MaxFixedVF = ElementCount::getFixed(1);
-  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
-                                           WidestType, MaxSafeFixedVF))
-    MaxFixedVF = MaxVF;
-
-  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
-                                           WidestType, MaxSafeScalableVF))
-    // FIXME: Return scalable VF as well (to be added in future patch).
-    if (MaxVF.isScalable())
-      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
-                        << "\n");
-
-  return MaxFixedVF;
-}
-
 Optional<ElementCount>
 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
@@ -5861,61 +5716,149 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   return None;
 }
 
-ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
-    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
-    ElementCount MaxSafeVF) {
-  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
-  TypeSize WidestRegister = TTI.getRegisterBitWidth(
-      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
-                           : TargetTransformInfo::RGK_FixedWidthVector);
+ElementCount
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+                                                 ElementCount UserVF) {
+  bool IgnoreScalableUserVF = UserVF.isScalable() &&
+                              !TTI.supportsScalableVectors() &&
+                              !ForceTargetSupportsScalableVectors;
+  if (IgnoreScalableUserVF) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Ignoring VF=" << UserVF
+               << " because target does not support scalable vectors.\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "Ignoring VF=" << ore::NV("UserVF", UserVF)
+             << " because target does not support scalable vectors.";
+    });
+  }
 
-  // Convenience function to return the minimum of two ElementCounts.
-  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
-    assert((LHS.isScalable() == RHS.isScalable()) &&
-           "Scalable flags must match");
-    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
-  };
+  // Beyond this point two scenarios are handled. If UserVF isn't specified
+  // then a suitable VF is chosen. If UserVF is specified and there are
+  // dependencies, check if it's legal. However, if a UserVF is specified and
+  // there are no dependencies, then there's nothing to do.
+  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
+    if (!canVectorizeReductions(UserVF)) {
+      reportVectorizationFailure(
+          "LV: Scalable vectorization not supported for the reduction "
+          "operations found in this loop. Using fixed-width "
+          "vectorization instead.",
+          "Scalable vectorization not supported for the reduction operations "
+          "found in this loop. Using fixed-width vectorization instead.",
+          "ScalableVFUnfeasible", ORE, TheLoop);
+      return computeFeasibleMaxVF(
+          ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+    }
+
+    if (Legal->isSafeForAnyVectorWidth())
+      return UserVF;
+  }
+
+  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+  unsigned SmallestType, WidestType;
+  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+  unsigned WidestRegister =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedSize();
+
+  // Get the maximum safe dependence distance in bits computed by LAA.
+  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+  // the memory accesses that is most restrictive (involved in the smallest
+  // dependence distance).
+  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
+
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value. Otherwise, return as is.
+  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
+    unsigned MaxSafeElements =
+        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+
+    if (UserVF.isScalable()) {
+      Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+
+      // Scale VF by vscale before checking if it's safe.
+      MaxSafeVF = ElementCount::getScalable(
+          MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+
+      if (MaxSafeVF.isZero()) {
+        // The dependence distance is too small to use scalable vectors,
+        // fallback on fixed.
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Max legal vector width too small, scalable vectorization "
+               "unfeasible. Using fixed-width vectorization instead.\n");
+        ORE->emit([&]() {
+          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+                                            TheLoop->getStartLoc(),
+                                            TheLoop->getHeader())
+                 << "Max legal vector width too small, scalable vectorization "
+                 << "unfeasible. Using fixed-width vectorization instead.";
+        });
+        return computeFeasibleMaxVF(
+            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
+      return UserVF;
+
+    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+                      << ".\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "User-specified vectorization factor "
+             << ore::NV("UserVectorizationFactor", UserVF)
+             << " is unsafe, clamping to maximum safe vectorization factor "
+             << ore::NV("VectorizationFactor", MaxSafeVF);
+    });
+    return MaxSafeVF;
+  }
+
+  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
 
   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
   // Note that both WidestRegister and WidestType may not be a powers of 2.
-  auto MaxVectorElementCount = ElementCount::get(
-      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
-      ComputeScalableMaxVF);
-  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
+  auto MaxVectorSize =
+      ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
+
+  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+                    << " / " << WidestType << " bits.\n");
   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
-                    << (MaxVectorElementCount * WidestType) << " bits.\n");
+                    << WidestRegister << " bits.\n");
 
-  if (!MaxVectorElementCount) {
+  assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
+         "Did not expect to pack so many elements"
+         " into one vector!");
+  if (MaxVectorSize.getFixedValue() == 0) {
     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
     return ElementCount::getFixed(1);
-  }
-
-  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
-  if (ConstTripCount &&
-      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
-      isPowerOf2_32(ConstTripCount)) {
+  } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
+             isPowerOf2_32(ConstTripCount)) {
     // We need to clamp the VF to be the ConstTripCount. There is no point in
-    // choosing a higher viable VF as done in the loop below. If
-    // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
-    // the TC is less than or equal to the known number of lanes.
+    // choosing a higher viable VF as done in the loop below.
     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                       << ConstTripCount << "\n");
-    return TripCountEC;
+    return ElementCount::getFixed(ConstTripCount);
   }
 
-  ElementCount MaxVF = MaxVectorElementCount;
+  ElementCount MaxVF = MaxVectorSize;
   if (TTI.shouldMaximizeVectorBandwidth() ||
       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
-    auto MaxVectorElementCountMaxBW = ElementCount::get(
-        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
-        ComputeScalableMaxVF);
-    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
-
     // Collect all viable vectorization factors larger than the default MaxVF
-    // (i.e. MaxVectorElementCount).
+    // (i.e. MaxVectorSize).
     SmallVector<ElementCount, 8> VFs;
-    for (ElementCount VS = MaxVectorElementCount * 2;
-         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
+    auto MaxVectorSizeMaxBW =
+        ElementCount::getFixed(WidestRegister / SmallestType);
+    for (ElementCount VS = MaxVectorSize * 2;
+         ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
       VFs.push_back(VS);
 
     // For each VF calculate its register usage.
@@ -5936,7 +5879,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
       }
     }
     if (ElementCount MinVF =
-            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
+            TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                           << ") with target's minimum: " << MinVF << '\n');

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 7b410dd73a3c1..582bf4dc5747c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -221,7 +221,7 @@ for.end:
   ret float %add
 }
 
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
 ; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
 define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-LABEL: @fadd_fast_bfloat
@@ -322,18 +322,18 @@ for.end:
 
 ; MUL
 
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
+; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
 define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @mul
 ; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <4 x i32>
-; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
+; CHECK: %[[LOAD1:.*]] = load <8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <8 x i32>
+; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
 entry:
   br label %for.body
 
@@ -352,22 +352,22 @@ for.end:                                 ; preds = %for.body, %entry
 }
 
 ; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
+; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
 define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @memory_dependence
 ; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <4 x i32>
-; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[LOAD3:.*]] = load <4 x i32>
-; CHECK: %[[LOAD4:.*]] = load <4 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: %[[LOAD1:.*]] = load <8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <8 x i32>
+; CHECK: %[[LOAD3:.*]] = load <8 x i32>
+; CHECK: %[[LOAD4:.*]] = load <8 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]]
 ; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
 entry:
   br label %for.body
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
deleted file mode 100644
index 6fe546439a133..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
+++ /dev/null
@@ -1,149 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON
-; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_MAXBW
-
-; Test that the MaxVF for the following loop, that has no dependence distances,
-; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16
-; (maximized bandwidth for i8 in the loop).
-define void @test0(i32* %a, i8* %b, i32* %c) {
-; CHECK: LV: Checking a loop in "test0"
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
-  %1 = load i8, i8* %arrayidx2, align 4
-  %zext = zext i8 %1 to i32
-  %add = add nsw i32 %zext, %0
-  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
-  store i32 %add, i32* %arrayidx5, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
-
-exit:
-  ret void
-}
-
-; Test that the MaxVF for the following loop, with a dependence distance
-; of 64 elements, is calculated as (maxvscale = 16) * 4.
-define void @test1(i32* %a, i8* %b) {
-; CHECK: LV: Checking a loop in "test1"
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
-  %1 = load i8, i8* %arrayidx2, align 4
-  %zext = zext i8 %1 to i32
-  %add = add nsw i32 %zext, %0
-  %2 = add nuw nsw i64 %iv, 64
-  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
-  store i32 %add, i32* %arrayidx5, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
-
-exit:
-  ret void
-}
-
-; Test that the MaxVF for the following loop, with a dependence distance
-; of 32 elements, is calculated as (maxvscale = 16) * 2.
-define void @test2(i32* %a, i8* %b) {
-; CHECK: LV: Checking a loop in "test2"
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
-; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
-  %1 = load i8, i8* %arrayidx2, align 4
-  %zext = zext i8 %1 to i32
-  %add = add nsw i32 %zext, %0
-  %2 = add nuw nsw i64 %iv, 32
-  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
-  store i32 %add, i32* %arrayidx5, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
-
-exit:
-  ret void
-}
-
-; Test that the MaxVF for the following loop, with a dependence distance
-; of 16 elements, is calculated as (maxvscale = 16) * 1.
-define void @test3(i32* %a, i8* %b) {
-; CHECK: LV: Checking a loop in "test3"
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
-; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
-  %1 = load i8, i8* %arrayidx2, align 4
-  %zext = zext i8 %1 to i32
-  %add = add nsw i32 %zext, %0
-  %2 = add nuw nsw i64 %iv, 16
-  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
-  store i32 %add, i32* %arrayidx5, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
-
-exit:
-  ret void
-}
-
-; Test the fallback mechanism when scalable vectors are not feasible due
-; to e.g. dependence distance. For the '-scalable-vectorization=exclusive'
-; it shouldn't try to vectorize with fixed-width vectors.
-define void @test4(i32* %a, i32* %b) {
-; CHECK: LV: Checking a loop in "test4"
-; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
-  %1 = load i32, i32* %arrayidx2, align 4
-  %add = add nsw i32 %1, %0
-  %2 = add nuw nsw i64 %iv, 8
-  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
-  store i32 %add, i32* %arrayidx5, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !2
-
-exit:
-  ret void
-}
-
-!0 = distinct !{!0, !1}
-!1 = !{!"llvm.loop.vectorize.enable", i1 true}
-!2 = distinct !{!2, !3, !4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
index 33d75e0a8565c..a6b50e03768c6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -37,10 +37,9 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 ; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check
 ; fixed-width vectorization is used instead.
 
-; CHECK-DBG: LV: Checking a loop in "test1"
-; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
-; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible.
-; CHECK-DBG: LV: The max safe fixed VF is: 8.
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 8.
 ; CHECK-DBG: LV: Selecting VF: 4.
 ; CHECK-LABEL: @test1
 ; CHECK: <4 x i32>
@@ -81,10 +80,9 @@ exit:
 ;   }
 ; }
 
-; CHECK-DBG: LV: Checking a loop in "test2"
-; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
-; CHECK-DBG: LV: The max safe fixed VF is: 4.
-; CHECK-DBG: LV: User VF=vscale x 8 is unsafe. Ignoring scalable UserVF.
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 4.
+; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
 ; CHECK-DBG: LV: Selecting VF: 4.
 ; CHECK-LABEL: @test2
 ; CHECK: <4 x i32>
@@ -131,7 +129,7 @@ exit:
 ; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
 
 ; CHECK-DBG-LABEL: LV: Checking a loop in "test3"
-; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
 ; CHECK-DBG: LV: Using user VF vscale x 2.
 ; CHECK-LABEL: @test3
 ; CHECK: <vscale x 2 x i32>
@@ -163,8 +161,7 @@ exit:
 
 ; test4
 ;
-; Scalable vectorization feasible, but the given VF is unsafe. Should ignore
-; the hint and leave it to the vectorizer to pick a more suitable VF.
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
 ;
 ; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
 ; words per 128-bits (packed).
@@ -176,16 +173,15 @@ exit:
 ;   }
 ; }
 ;
-; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize.
+; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2.
 
 ; CHECK-DBG-LABEL: LV: Checking a loop in "test4"
-; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
-; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
-; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a suitable VF.
-; CHECK-DBG: Found feasible scalable VF = vscale x 2
-; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
+; CHECK-DBG: LV: Using max VF vscale x 2
 ; CHECK-LABEL: @test4
-; CHECK: <4 x i32>
+; CHECK: <vscale x 2 x i32>
 define void @test4(i32* %a, i32* %b) {
 entry:
   br label %loop
@@ -229,7 +225,7 @@ exit:
 ; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
 
 ; CHECK-DBG-LABEL: LV: Checking a loop in "test5"
-; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
 ; CHECK-DBG: LV: Using user VF vscale x 4
 ; CHECK-LABEL: @test5
 ; CHECK: <vscale x 4 x i32>
@@ -261,8 +257,7 @@ exit:
 
 ; test6
 ;
-; Scalable vectorization feasible, but the VF is unsafe. Should ignore
-; the hint and leave it to the vectorizer to pick a more suitable VF.
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
 ;
 ; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
 ;
@@ -273,16 +268,15 @@ exit:
 ;   }
 ; }
 ;
-; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize.
+; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8.
 
 ; CHECK-DBG-LABEL: LV: Checking a loop in "test6"
-; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
-; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF.
-; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a suitable VF.
-; CHECK-DBG: LV: Found feasible scalable VF = vscale x 4
-; CHECK-DBG: Selecting VF: 4.
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
+; CHECK-DBG: LV: Using max VF vscale x 8
 ; CHECK-LABEL: @test6
-; CHECK: <4 x i32>
+; CHECK: <vscale x 8 x i32>
 define void @test6(i32* %a, i32* %b) {
 entry:
   br label %loop
@@ -310,9 +304,8 @@ exit:
 !17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 
 ; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve"
-; CHECK-NO-SVE: LV: Disabling scalable vectorization, because target does not support scalable vectors.
-; CHECK-NO-SVE: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors.
-; CHECK-NO-SVE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
+; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
 ; CHECK-NO-SVE: LV: Selecting VF: 4.
 ; CHECK-NO-SVE: <4 x i32>
 ; CHECK-NO-SVE-NOT: <vscale x 4 x i32>
@@ -344,8 +337,8 @@ exit:
 ; supported but max vscale is undefined.
 ;
 ; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale"
-; CEHCK-NO-MAX-VSCALE: The max safe fixed VF is: 4.
-; CHECK-NO-MAX-VSCALE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
+; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-NO-MAX-VSCALE: The max safe VF is: 4.
 ; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4.
 ; CHECK-NO-MAX-VSCALE: <4 x i32>
 define void @test_no_max_vscale(i32* %a, i32* %b) {

diff  --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
index f99d87f8ad795..1361ba59bca2d 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -3,8 +3,8 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-; CHECK: LV: Disabling scalable vectorization, because target does not support scalable vectors.
-; CHECK: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors.
+; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
 ; CHECK: LV: The Widest register safe to use is: 32 bits.
 define void @test1(i32* %a, i32* %b) {
 entry: