[llvm-branch-commits] [llvm] cba4acc - [LV] Clamp VF hint when unsafe

Tue Dec 1 03:34:44 PST 2020

Author: Cullen Rhodes
Date: 2020-12-01T11:30:34Z
New Revision: cba4accda08f90bbc96d7662ef6b1bb12a7733f2

URL: https://github.com/llvm/llvm-project/commit/cba4accda08f90bbc96d7662ef6b1bb12a7733f2
DIFF: https://github.com/llvm/llvm-project/commit/cba4accda08f90bbc96d7662ef6b1bb12a7733f2.diff

LOG: [LV] Clamp VF hint when unsafe

In the following loop the dependence distance is 2 and can only be
vectorized if the vector length is no larger than this.

  void foo(int *a, int *b, int N) {
    #pragma clang loop vectorize(enable) vectorize_width(4)
    for (int i=0; i<N; ++i) {
      a[i + 2] = a[i] + b[i];
    }
  }

However, when specifying a VF of 4 via a loop hint this loop is
vectorized. According to [1][2], loop hints are ignored if the
optimization is not safe to apply.

This patch introduces a check to bail of vectorization if the user
specified VF is greater than the maximum feasible VF, unless explicitly
forced with '-force-vector-width=X'.

[1] https://llvm.org/docs/LangRef.html#llvm-loop-vectorize-and-llvm-loop-interleave
[2] https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations

Reviewed By: sdesmalen, fhahn, Meinersbur

Differential Revision: https://reviews.llvm.org/D90687

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
    llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d389e03e9c04..d87938bb1464 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1443,7 +1443,8 @@ class LoopVectorizationCostModel {
   /// \return An upper bound for the vectorization factor, a power-of-2 larger
   /// than zero. One is returned if vectorization should best be avoided due
   /// to cost.
-  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount);
+  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
+                                    ElementCount UserVF);
 
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -5270,9 +5271,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return None;
   }
 
+  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
+
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
-    return UserVF ? UserVF : computeFeasibleMaxVF(TC);
+    return MaxVF;
   case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
         dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -5308,7 +5311,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
 
-  ElementCount MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
   assert(!MaxVF.isScalable() &&
          "Scalable vectors do not yet support tail folding");
   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
@@ -5361,7 +5363,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
 }
 
 ElementCount
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+                                                 ElementCount UserVF) {
+  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5373,6 +5377,27 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
   // dependence distance).
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
 
+  if (UserVF.isNonZero()) {
+    // If legally unsafe, clamp the user vectorization factor to a safe value.
+    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+    if (UserVF.getFixedValue() <= MaxSafeVF)
+      return UserVF;
+
+    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+                      << ".\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "User-specified vectorization factor "
+             << ore::NV("UserVectorizationFactor", UserVF)
+             << " is unsafe, clamping to maximum safe vectorization factor "
+             << ore::NV("VectorizationFactor", MaxSafeVF);
+    });
+    return ElementCount::getFixed(MaxSafeVF);
+  }
+
   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
 
   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
@@ -7031,9 +7056,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.invalidateCostModelingDecisions();
   }
 
-  if (!UserVF.isZero()) {
+  ElementCount MaxVF = MaybeMaxVF.getValue();
+  assert(MaxVF.isNonZero() && "MaxVF is zero.");
+
+  if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+    assert(isPowerOf2_32(UserVF.getFixedValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
@@ -7044,9 +7072,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     return {{UserVF, 0}};
   }
 
-  ElementCount MaxVF = MaybeMaxVF.getValue();
-  assert(MaxVF.isNonZero() && "MaxVF is zero.");
-
   for (ElementCount VF = ElementCount::getFixed(1);
        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
     // Collect Uniform and Scalar instructions after vectorization with VF.

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
new file mode 100644
index 000000000000..44293441e492
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
@@ -0,0 +1,43 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Specify a large unsafe vectorization factor of 32 that gets clamped to 16,
+; then test an even smaller VF of 2 is selected based on the cost-model.
+
+; CHECK: LV: User VF=32 is unsafe, clamping to max safe VF=16.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 32 is unsafe, clamping to maximum safe vectorization factor 16
+; CHECK: LV: Selecting VF: 2.
+; CHECK-LABEL: @test
+; CHECK: <2 x i64>
+define void @test(i64* nocapture %a, i64* nocapture readonly %b) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
+  %0 = load i64, i64* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i64, i64* %b, i64 %iv
+  %1 = load i64, i64* %arrayidx2, align 4
+  %add = add nsw i64 %1, %0
+  %2 = add nuw nsw i64 %iv, 16
+  %arrayidx5 = getelementptr inbounds i64, i64* %a, i64 %2
+  %c = icmp eq i64 %1, 120
+  br i1 %c, label %then, label %latch
+
+then:
+  store i64 %add, i64* %arrayidx5, align 4
+  br label %latch
+
+latch:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i64 32}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}

diff  --git a/llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
new file mode 100644
index 000000000000..a6b0eec230bd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
@@ -0,0 +1,46 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure the unsafe user specified vectorization factor is clamped.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; void foo(int *a, int *b) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (int i=0; i < 1024; ++i) {
+;     a[i + 2] = a[i] + b[i];
+;   }
+; }
+
+; CHECK: LV: User VF=4 is unsafe, clamping to max safe VF=2.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 4 is unsafe, clamping to maximum safe vectorization factor 2
+; CHECK-LABEL: @foo
+; CHECK: <2 x i32>
+define void @foo(i32* %a, i32* %b) {
+entry:
+  br label %loop.ph
+
+loop.ph:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}