[llvm] [SCEV][LV] Add Stride equal to one Predicate to enable strided access versioning (PR #77287)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 8 01:00:08 PST 2024

llvmbot wrote:



Author: None (ShivaChen)


There is a case in TSVC that is not vectorized because the backedge-taken count (BECount) is unknown.

    float  s172(int xa, int xb)  {
      for (int i = xa - 1; i < 32000; i += xb)
         a[i] += b[i];

By assuming the stride is one and generating a runtime check to guard the vectorized loop, it seems the case can be vectorized.

Full diff: https://github.com/llvm/llvm-project/pull/77287.diff

2 Files Affected:

- (modified) llvm/lib/Analysis/ScalarEvolution.cpp (+14-1) 
- (modified) llvm/test/Transforms/LoopVectorize/version-mem-access.ll (+52) 

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 623814c038a78f..3c712ead953186 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -12778,10 +12778,23 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
     // The positive stride case is the same as isKnownPositive(Stride) returning
     // true (original behavior of the function).
-    if (PredicatedIV || !NoWrap || !loopIsFiniteByAssumption(L) ||
+    if (PredicatedIV || !loopIsFiniteByAssumption(L) ||
       return getCouldNotCompute();
+    // Adding Stride equal to one Predicate when there is no wrap flags.
+    // It might enable strided access versioning in LAA and calculate BECount
+    // with Stride = 1.
+    if (!NoWrap) {
+      if (AllowPredicates) {
+        const auto *One =
+            static_cast<const SCEVConstant *>(getOne(Stride->getType()));
+        Predicates.insert(getEqualPredicate(Stride, One));
+        Stride = One;
+      } else
+        return getCouldNotCompute();
+    }
     if (!isKnownNonZero(Stride)) {
       // If we have a step of zero, and RHS isn't invariant in L, we don't know
       // if it might eventually be greater than start and if so, on which
diff --git a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
index 7bf4fbd89b0eea..f1283365ef52a4 100644
--- a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -92,3 +92,55 @@ for.end.loopexit:
   ret void
+; We can vectorize the loop by using stride = 1 to calculate iteration count
+; and generate the runtime check to guard the vectorized loop.
+; CHECK-LABEL: s172
+; CHECK-DAG: icmp ne i32 %xb, 1
+; CHECK: vector.body
+ at b = global [32000 x float] zeroinitializer, align 64
+ at a = global [32000 x float] zeroinitializer, align 64
+; for (int i = xa - 1; i < 32000; i += xb)
+;   a[i] += b[i];
+define float @s172(i32 signext %xa, i32 signext %xb) mustprogress {
+  %cmp214 = icmp slt i32 %xa, 32001
+  br i1 %cmp214, label %for.body.us.preheader, label %for.cond.cleanup
+for.body.us.preheader:                            ; preds = %entry
+  %sub = add i32 %xa, -1
+  %0 = sext i32 %sub to i64
+  %1 = sext i32 %xb to i64
+  br label %for.body.us
+for.body.us:                                      ; preds = %for.body.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
+  %nl.016.us = phi i32 [ %inc.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ]
+  br label %for.body4.us
+for.body4.us:                                     ; preds = %for.body.us, %for.body4.us
+  %indvars.iv = phi i64 [ %0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ]
+  %arrayidx.us = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx.us, align 4
+  %arrayidx6.us = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
+  %3 = load float, ptr %arrayidx6.us, align 4
+  %add.us = fadd fast float %3, %2
+  store float %add.us, ptr %arrayidx6.us, align 4
+  %indvars.iv.next = add i64 %indvars.iv, %1
+  %cmp2.us = icmp slt i64 %indvars.iv.next, 32000
+  br i1 %cmp2.us, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us
+  %inc.us = add nuw nsw i32 %nl.016.us, 1
+  %exitcond.not = icmp eq i32 %inc.us, 100000
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body.us
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us
+  br label %for.cond.cleanup
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret float undef




