[llvm] 1e7efd3 - [LV] Legalize scalable VF hints

Fri Jan 8 03:03:38 PST 2021

Author: Cullen Rhodes
Date: 2021-01-08T10:49:44Z
New Revision: 1e7efd397ac259429df9f212c9ec16bec8ff2533

URL: https://github.com/llvm/llvm-project/commit/1e7efd397ac259429df9f212c9ec16bec8ff2533
DIFF: https://github.com/llvm/llvm-project/commit/1e7efd397ac259429df9f212c9ec16bec8ff2533.diff

LOG: [LV] Legalize scalable VF hints

In the following loop:

  void foo(int *a, int *b, int N) {
    for (int i=0; i<N; ++i)
      a[i + 4] = a[i] + b[i];
  }

The loop dependence constrains the VF to a maximum of (4, fixed), which
would mean using <4 x i32> as the vector type in vectorization.
Extending this to scalable vectorization, a VF of (4, scalable) implies
a vector type of <vscale x 4 x i32>. To determine if this is legal
vscale must be taken into account. For this example, unless
max(vscale)=1, it's unsafe to vectorize.

For SVE, the number of bits in an SVE register is architecturally
defined to be a multiple of 128 bits with a maximum of 2048 bits, thus
the maximum vscale is 16. In the loop above it is therefore unfeasible
to vectorize with SVE. However, in this loop:

  void foo(int *a, int *b, int N) {
    #pragma clang loop vectorize_width(X, scalable)
    for (int i=0; i<N; ++i)
      a[i + 32] = a[i] + b[i];
  }

As long as max(vscale) multiplied by the number of lanes 'X' doesn't
exceed the dependence distance, it is safe to vectorize. For SVE a VF of
(2, scalable) is within this constraint, since a vector of <16 x 2 x 32>
will have no dependencies between lanes. For any number of lanes larger
than this it would be unsafe to vectorize.

This patch extends 'computeFeasibleMaxVF' to legalize scalable VFs
specified as loop hints, implementing the following behaviour:
  * If the backend does not support scalable vectors, ignore the hint.
  * If scalable vectorization is unfeasible given the loop
    dependence, like in the first example above for SVE, then use a
    fixed VF.
  * Accept scalable VFs if it's safe to do so.
  * Otherwise, clamp scalable VFs that exceed the maximum safe VF.

Reviewed By: sdesmalen, fhahn, david-arm

Differential Revision: https://reviews.llvm.org/D91718

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
    llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
    llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll

Modified: 
    llvm/include/llvm/Analysis/LoopAccessAnalysis.h
    llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/metadata-width.ll
    llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
    llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 5ee98f308878..13fbe884eddf 100644

--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -205,6 +205,12 @@ class MemoryDepChecker {
     return Status == VectorizationSafetyStatus::Safe;
   }
 
+  /// Return true if the number of elements that are safe to operate on
+  /// simultaneously is not bounded.
+  bool isSafeForAnyVectorWidth() const {
+    return MaxSafeVectorWidthInBits == UINT_MAX;
+  }
+
   /// The maximum number of bytes of a vector register we can vectorize
   /// the accesses safely with.
   uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

diff  --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index f701e08961a0..7e5a723991d4 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -325,6 +325,10 @@ class LoopVectorizationLegality {
 
   const LoopAccessInfo *getLAI() const { return LAI; }
 
+  bool isSafeForAnyVectorWidth() const {
+    return LAI->getDepChecker().isSafeForAnyVectorWidth();
+  }
+
   unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
 
   uint64_t getMaxSafeVectorWidthInBits() const {

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0b58ad1ff2ea..21d614934792 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -272,6 +272,12 @@ static cl::opt<unsigned> ForceTargetInstructionCost(
              "an instruction to a single constant value. Mostly "
              "useful for getting consistent testing."));
 
+static cl::opt<bool> ForceTargetSupportsScalableVectors(
+    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Pretend that scalable vectors are supported, even if the target does "
+        "not support them. This flag should only be used for testing."));
+
 static cl::opt<unsigned> SmallLoopCost(
     "small-loop-cost", cl::init(20), cl::Hidden,
     cl::desc(
@@ -5592,6 +5598,30 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
 ElementCount
 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                  ElementCount UserVF) {
+  bool IgnoreScalableUserVF = UserVF.isScalable() &&
+                              !TTI.supportsScalableVectors() &&
+                              !ForceTargetSupportsScalableVectors;
+  if (IgnoreScalableUserVF) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Ignoring VF=" << UserVF
+               << " because target does not support scalable vectors.\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "Ignoring VF=" << ore::NV("UserVF", UserVF)
+             << " because target does not support scalable vectors.";
+    });
+  }
+
+  // Beyond this point two scenarios are handled. If UserVF isn't specified
+  // then a suitable VF is chosen. If UserVF is specified and there are
+  // dependencies, check if it's legal. However, if a UserVF is specified and
+  // there are no dependencies, then there's nothing to do.
+  if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
+      Legal->isSafeForAnyVectorWidth())
+    return UserVF;
+
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5603,15 +5633,42 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
   // dependence distance).
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
 
-  if (UserVF.isNonZero()) {
-    // For now, don't verify legality of scalable vectors.
-    // This will be addressed properly in https://reviews.llvm.org/D91718.
-    if (UserVF.isScalable())
-      return UserVF;
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value. Otherwise, return as is.
+  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
+    unsigned MaxSafeElements =
+        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+
+    if (UserVF.isScalable()) {
+      Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+
+      // Scale VF by vscale before checking if it's safe.
+      MaxSafeVF = ElementCount::getScalable(
+          MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+
+      if (MaxSafeVF.isZero()) {
+        // The dependence distance is too small to use scalable vectors,
+        // fallback on fixed.
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Max legal vector width too small, scalable vectorization "
+               "unfeasible. Using fixed-width vectorization instead.\n");
+        ORE->emit([&]() {
+          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+                                            TheLoop->getStartLoc(),
+                                            TheLoop->getHeader())
+                 << "Max legal vector width too small, scalable vectorization "
+                 << "unfeasible. Using fixed-width vectorization instead.";
+        });
+        return computeFeasibleMaxVF(
+            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+      }
+    }
 
-    // If legally unsafe, clamp the user vectorization factor to a safe value.
-    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
-    if (UserVF.getFixedValue() <= MaxSafeVF)
+    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
       return UserVF;
 
     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
@@ -5626,7 +5683,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
              << " is unsafe, clamping to maximum safe vectorization factor "
              << ore::NV("VectorizationFactor", MaxSafeVF);
     });
-    return ElementCount::getFixed(MaxSafeVF);
+    return MaxSafeVF;
   }
 
   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
@@ -7426,17 +7483,24 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
 
-  if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
+  if (!UserVF.isZero() &&
+      (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
+    // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
+    // VFs here, this should be reverted to only use legal UserVFs once the
+    // loop below supports scalable VFs.
+    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
+                      << " VF " << VF << ".\n");
+    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
-    CM.selectUserVectorizationFactor(UserVF);
+    CM.selectUserVectorizationFactor(VF);
     CM.collectInLoopReductions();
-    buildVPlansWithVPRecipes(UserVF, UserVF);
+    buildVPlansWithVPRecipes(VF, VF);
     LLVM_DEBUG(printPlans(dbgs()));
-    return {{UserVF, 0}};
+    return {{VF, 0}};
   }
 
   assert(!MaxVF.isScalable() &&

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
new file mode 100644
index 000000000000..a6b50e03768c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -0,0 +1,368 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -S < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-DBG %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SVE %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -loop-vectorize -force-target-supports-scalable-vectors=true -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-MAX-VSCALE %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; These tests validate the behaviour of scalable vectorization factor hints,
+; where the following applies:
+;
+; * If the backend does not support scalable vectors, ignore the hint and let
+;   the vectorizer pick a VF.
+; * If there are no dependencies and assuming the VF is a power of 2 the VF
+;   should be accepted. This applies to both fixed and scalable VFs.
+; * If the dependency is too small to use scalable vectors, change the VF to
+;   fixed, where existing behavior applies (clamping).
+; * If scalable vectorization is feasible given the dependency and the VF is
+;   valid, accept it. Otherwise, clamp to the max scalable VF.
+
+; test1
+;
+; Scalable vectorization unfeasible, clamp VF from (4, scalable) -> (4, fixed).
+;
+; The pragma applied to this loop implies a scalable vector <vscale x 4 x i32>
+; be used for vectorization. For fixed vectors the MaxVF=8, otherwise there
+; would be a dependence between vector lanes for vectors greater than 256 bits.
+;
+; void test1(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 8] = a[i] + b[i];
+;   }
+; }
+;
+; For scalable vectorization 'vscale' has to be considered, for this example
+; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check
+; fixed-width vectorization is used instead.
+
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 8.
+; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-LABEL: @test1
+; CHECK: <4 x i32>
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test2
+;
+; Scalable vectorization unfeasible, clamp VF from (8, scalable) -> (4, fixed).
+;
+; void test2(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(8, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 4] = a[i] + b[i];
+;   }
+; }
+
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 4.
+; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
+; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-LABEL: @test2
+; CHECK: <4 x i32>
+define void @test2(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !3
+
+exit:
+  ret void
+}
+
+!3 = !{!3, !4, !5}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
+!5 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test3
+;
+; Scalable vectorization feasible and the VF is valid.
+;
+; Specifies a vector of <vscale x 2 x i32>, i.e. maximum of 32 x i32 with 2
+; words per 128-bits (unpacked).
+;
+; void test3(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(2, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test3"
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: Using user VF vscale x 2.
+; CHECK-LABEL: @test3
+; CHECK: <vscale x 2 x i32>
+define void @test3(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !6
+
+exit:
+  ret void
+}
+
+!6 = !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
+!8 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test4
+;
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test4(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test4"
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
+; CHECK-DBG: LV: Using max VF vscale x 2
+; CHECK-LABEL: @test4
+; CHECK: <vscale x 2 x i32>
+define void @test4(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !9
+
+exit:
+  ret void
+}
+
+!9 = !{!9, !10, !11}
+!10 = !{!"llvm.loop.vectorize.width", i32 4}
+!11 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test5
+;
+; Scalable vectorization feasible and the VF is valid.
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test5(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test5"
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: Using user VF vscale x 4
+; CHECK-LABEL: @test5
+; CHECK: <vscale x 4 x i32>
+define void @test5(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !12
+
+exit:
+  ret void
+}
+
+!12 = !{!12, !13, !14}
+!13 = !{!"llvm.loop.vectorize.width", i32 4}
+!14 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test6
+;
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+;
+; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
+;
+; void test6(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(16, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test6"
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
+; CHECK-DBG: LV: Using max VF vscale x 8
+; CHECK-LABEL: @test6
+; CHECK: <vscale x 8 x i32>
+define void @test6(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !15
+
+exit:
+  ret void
+}
+
+!15 = !{!15, !16, !17}
+!16 = !{!"llvm.loop.vectorize.width", i32 16}
+!17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve"
+; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: LV: Selecting VF: 4.
+; CHECK-NO-SVE: <4 x i32>
+; CHECK-NO-SVE-NOT: <vscale x 4 x i32>
+define void @test_no_sve(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  store i32 %add, i32* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !18
+
+exit:
+  ret void
+}
+
+!18 = !{!18, !19, !20}
+!19 = !{!"llvm.loop.vectorize.width", i32 4}
+!20 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; Test the LV falls back to fixed-width vectorization if scalable vectors are
+; supported but max vscale is undefined.
+;
+; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale"
+; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4.
+; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4.
+; CHECK-NO-MAX-VSCALE: <4 x i32>
+define void @test_no_max_vscale(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !21
+
+exit:
+  ret void
+}
+
+!21 = !{!21, !22, !23}
+!22 = !{!"llvm.loop.vectorize.width", i32 4}
+!23 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

diff  --git a/llvm/test/Transforms/LoopVectorize/metadata-width.ll b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
index e107de672c8b..1dc51cac3b37 100644
--- a/llvm/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-target-supports-scalable-vectors=true -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff  --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
index 315b1fed31ab..c4a8f0ded86b 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
@@ -99,27 +99,3 @@ for.end.loopexit:                                 ; preds = %for.body
 for.end:                                          ; preds = %for.end.loopexit, %entry
   ret void
 }
-
-; Currently we cannot handle scalable vectorization factors.
-; CHECK: LV: Checking a loop in "f4"
-; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported.
-
-define void @f4(i8* %A) {
-entry:
-  br label %for.body
-
-for.body:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv
-  store i8 1, i8* %arrayidx, align 1
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond = icmp ne i64 %iv.next, 1024
-  br i1 %exitcond, label %for.body, label %exit, !llvm.loop !0
-
-exit:
-  ret void
-}
-
-!0 = !{!0, !1, !2}
-!1 = !{!"llvm.loop.vectorize.width", i32 4}
-!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

diff  --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
new file mode 100644
index 000000000000..0a6925d9b096
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
@@ -0,0 +1,27 @@
+; REQUIRES: asserts
+; RUN: opt < %s  -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
+
+; Currently we cannot handle scalable vectorization factors.
+; CHECK: LV: Checking a loop in "f1"
+; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported.
+
+define void @f1(i8* %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv
+  store i8 1, i8* %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 1024
+  br i1 %exitcond, label %for.body, label %exit, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

diff  --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
index aeda1800e21f..3d845e659e47 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -force-vector-width=4 -force-target-supports-scalable-vectors=true < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 -force-vector-width=4 -force-target-supports-scalable-vectors=true < %s | FileCheck %s --check-prefix=CHECKUF2
 
 ; CHECKUF1: for.body.preheader:
 ; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
@@ -96,6 +96,5 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
 }
 
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

diff  --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
new file mode 100644
index 000000000000..1361ba59bca2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: LV: The Widest register safe to use is: 32 bits.
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}