[PATCH] D37425: LoopVectorize: MaxVF should not be larger than the loop trip count

Sun Sep 3 07:46:28 PDT 2017

zvi created this revision.

Improve how MaxVF is computed while taking into account that MaxVF should not be larger than the loop's trip count.

Other than saving on compile-time by pruning the possible MaxVF candidates, this patch fixes pr34438 which exposed the following flow:

1. Short trip count identified -> Don't bail out, set OptForSize:=True to avoid tail-loop and runtime checks.
2. Compute MaxVF returned 16 on a target supporting AVX512.
3. OptForSize -> choose VF:=MaxVF.
4. Bail out because TripCount = 8, VF = 16, TripCount % VF !=0 means we need a tail loop.

With this patch step 2. will choose MaxVF=8 based on TripCount.


https://reviews.llvm.org/D37425

Files:
  lib/Transforms/Vectorize/LoopVectorize.cpp
  test/Transforms/LoopVectorize/X86/pr34438.ll


Index: test/Transforms/LoopVectorize/X86/pr34438.ll
===================================================================

--- /dev/null
+++ test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -0,0 +1,40 @@
+; PR34438
+; Loop has a short trip count of 8 iterations. It should be vectorized because no runtime checks or tail loop are necessary.
+; Two cases tested AVX (MaxVF=8 = TripCount) and AVX512 (MaxVF=16 > TripCount)
+
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S -vectorizer-min-trip-count=21 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=skylake-avx512 -debug-only=loop-vectorize -stats -S -vectorizer-min-trip-count=21 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: LV: Loop hints: force=?
+; No more loops in the module
+; CHECK-NOT: LV: Loop hints: force=
+; CHECK: 1 loop-vectorize               - Number of loops analyzed for vectorization
+; CHECK: 1 loop-vectorize               - Number of loops vectorized
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @small_tc(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
+  %add = fadd fast float %0, %1
+  store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+  ret void
+}
+
+!3 = !{!3}
+!4 = !{!4}
+
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1960,7 +1960,7 @@
 private:
   /// \return An upper bound for the vectorization factor, larger than zero.
   /// One is returned if vectorization should best be avoided due to cost.
-  unsigned computeFeasibleMaxVF(bool OptForSize);
+  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount = 0);
 
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -6187,7 +6187,7 @@
     return None;
   }
 
-  unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
+  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
 
   if (TC % MaxVF != 0) {
     // If the trip count that we found modulo the vectorization factor is not
@@ -6208,7 +6208,9 @@
   return MaxVF;
 }
 
-unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
+unsigned
+LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
+                                                 unsigned ConstTripCount) {
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -6237,7 +6239,9 @@
   if (MaxVectorSize == 0) {
     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
     MaxVectorSize = 1;
-  }
+  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
+             isPowerOf2_32(ConstTripCount))
+    MaxVectorSize = ConstTripCount;
 
   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
                                 " into one vector!");


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D37425.113698.patch
Type: text/x-patch
Size: 3952 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20170903/12e2e152/attachment.bin>