[llvm] [SCEV][LV] Add Stride equal to one Predicate to enable strided access versioning (PR #77287)

Mon Jan 8 00:59:42 PST 2024

https://github.com/ShivaChen created https://github.com/llvm/llvm-project/pull/77287

There is a loop in TSVC that is not vectorized because its backedge-taken count (BECount) is unknown.

    float  s172(int xa, int xb)  {
      for (int i = xa - 1; i < 32000; i += xb)
         a[i] += b[i];
    }

By assuming the stride is one and generating a runtime check to guard the vectorized loop, this case can be vectorized.

>From 18fab95c34ba6ccae8f1b34ba48a2c7b2b508804 Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Thu, 4 Jan 2024 07:40:11 +0000
Subject: [PATCH 1/2] Add s172() to version-mem-access.ll

---
 .../LoopVectorize/version-mem-access.ll       | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
index 7bf4fbd89b0eea..353c0185cfae03 100644
--- a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -92,3 +92,51 @@ for.end.loopexit:
 for.end:
   ret void
 }
+
+; CHECK-LABEL: s172
+; CHECK-NOT: vector.body
+
+@b = global [32000 x float] zeroinitializer, align 64
+@a = global [32000 x float] zeroinitializer, align 64
+
+; for (int i = xa - 1; i < 32000; i += xb)
+;   a[i] += b[i];
+;
+define float @s172(i32 signext %xa, i32 signext %xb) mustprogress {
+entry:
+  %cmp214 = icmp slt i32 %xa, 32001
+  br i1 %cmp214, label %for.body.us.preheader, label %for.cond.cleanup
+
+for.body.us.preheader:                            ; preds = %entry
+  %sub = add i32 %xa, -1
+  %0 = sext i32 %sub to i64
+  %1 = sext i32 %xb to i64
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %for.body.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
+  %nl.016.us = phi i32 [ %inc.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.body.us.preheader ]
+  br label %for.body4.us
+
+for.body4.us:                                     ; preds = %for.body.us, %for.body4.us
+  %indvars.iv = phi i64 [ %0, %for.body.us ], [ %indvars.iv.next, %for.body4.us ]
+  %arrayidx.us = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv
+  %2 = load float, ptr %arrayidx.us, align 4
+  %arrayidx6.us = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
+  %3 = load float, ptr %arrayidx6.us, align 4
+  %add.us = fadd fast float %3, %2
+  store float %add.us, ptr %arrayidx6.us, align 4
+  %indvars.iv.next = add i64 %indvars.iv, %1
+  %cmp2.us = icmp slt i64 %indvars.iv.next, 32000
+  br i1 %cmp2.us, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us
+  %inc.us = add nuw nsw i32 %nl.016.us, 1
+  %exitcond.not = icmp eq i32 %inc.us, 100000
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body.us
+
+for.cond.cleanup.loopexit:                        ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret float undef
+}

>From a88b09a6654e00376f7a3777f6e1ab4883a15899 Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Thu, 28 Dec 2023 06:41:20 +0000
Subject: [PATCH 2/2] [SCEV][LV] Add Stride equal to one Predicate to enable
 strided access versioning

---
 llvm/lib/Analysis/ScalarEvolution.cpp             | 15 ++++++++++++++-
 .../LoopVectorize/version-mem-access.ll           |  6 +++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 623814c038a78f..3c712ead953186 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -12778,10 +12778,23 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
     // The positive stride case is the same as isKnownPositive(Stride) returning
     // true (original behavior of the function).
     //
-    if (PredicatedIV || !NoWrap || !loopIsFiniteByAssumption(L) ||
+    if (PredicatedIV || !loopIsFiniteByAssumption(L) ||
         !loopHasNoAbnormalExits(L))
       return getCouldNotCompute();
 
+    // Adding Stride equal to one Predicate when there is no wrap flags.
+    // It might enable strided access versioning in LAA and calculate BECount
+    // with Stride = 1.
+    if (!NoWrap) {
+      if (AllowPredicates) {
+        const auto *One =
+            static_cast<const SCEVConstant *>(getOne(Stride->getType()));
+        Predicates.insert(getEqualPredicate(Stride, One));
+        Stride = One;
+      } else
+        return getCouldNotCompute();
+    }
+
     if (!isKnownNonZero(Stride)) {
       // If we have a step of zero, and RHS isn't invariant in L, we don't know
       // if it might eventually be greater than start and if so, on which
diff --git a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
index 353c0185cfae03..f1283365ef52a4 100644
--- a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -93,8 +93,12 @@ for.end:
   ret void
 }
 
+; We can vectorize the loop by using stride = 1 to calculate iteration count
+; and generate the runtime check to guard the vectorized loop.
+
 ; CHECK-LABEL: s172
-; CHECK-NOT: vector.body
+; CHECK-DAG: icmp ne i32 %xb, 1
+; CHECK: vector.body
 
 @b = global [32000 x float] zeroinitializer, align 64
 @a = global [32000 x float] zeroinitializer, align 64