[PATCH] D93615: [LV] Avoid needless fold tail

Mon Dec 21 05:42:01 PST 2020

gilr updated this revision to Diff 313091.
gilr added a comment.

Add a test for a constant TC with IC=3.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D93615/new/

https://reviews.llvm.org/D93615

Files:
  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
  llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
  llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll


Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
===================================================================

--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+; Make sure loops are vectorized under -Os without folding their tails, based
+; on the lower bits of their trip-count being zero.
+
+; CHECK-LABEL: alignTC
+; CHECK:       vector.body:
+; CHECK:         store <4 x i32>
+
+define dso_local void @alignTC(i32* noalias nocapture %A, i32 %n) optsize {
+entry:
+  %alignedTC = and i32 %n, -8
+  br label %loop
+
+loop:
+  %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
+  store i32 13, i32* %arrayidx, align 1
+  %rivPlus1 = add nuw nsw i32 %riv, 1
+  %cond = icmp eq i32 %rivPlus1, %alignedTC
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=3 -force-vector-width=2 -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+; Make sure the loop is unrolled under -Os without folding its tail, based on its trip-count
+; being provably divisible by the chosen VFxIC.
+
+; CHECK-LABEL:   constTC
+; CHECK:         vector.body:
+; CHECK-COUNT-3:   store <2 x i32>
+; CHECK:         br i1
+
+define dso_local void @constTC(i32* noalias nocapture %A) optsize {
+entry:
+  br label %loop
+
+loop:
+  %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
+  store i32 13, i32* %arrayidx, align 1
+  %rivPlus1 = add nuw nsw i32 %riv, 1
+  %cond = icmp eq i32 %rivPlus1, 1800
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5508,6 +5508,19 @@
     return MaxVF;
   }
 
+  // Avoid tail folding if the trip count is known to be a multiple of any VF we
+  // chose.
+  ScalarEvolution *SE = PSE.getSE();
+  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+  const SCEV *ExitCount = SE->getAddExpr(
+      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+  unsigned TCisMultipleOf = 1 << SE->GetMinTrailingZeros(ExitCount);
+  if (TCisMultipleOf % MaxVFtimesIC == 0) {
+    // Accept MaxVF if we do not have a tail.
+    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+    return MaxVF;
+  }
+
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D93615.313091.patch
Type: text/x-patch
Size: 3208 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20201221/d7516794/attachment.bin>