[llvm] Use MaxStride instead of CommonStride to calculate MaxVF (PR #98142)

Tue Jul 9 03:14:48 PDT 2024

https://github.com/mrdaybird created https://github.com/llvm/llvm-project/pull/98142

Currently, we bail out of the safe MaxVF calculation when the strides of the two accesses are not the same, and instead depend on runtime checks, which are not yet implemented. We could use MaxStride instead.
This handles cases like the following:
```c
#define LEN 256 * 256
float a[LEN];

void gather() {
  for (int i = 0; i < LEN - 1024 - 255; i++) {
  #pragma clang loop interleave(disable)
  #pragma clang loop unroll(disable)
    for (int j = 0; j < 256; j++)
      a[i + j + 1024] += a[j * 4 + i];
  }
}
```
I am not sure about the correctness of this change, but intuitively it feels right.

>From a52e5e6c77278812031bc4c4f68316dae611fee3 Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Tue, 9 Jul 2024 14:42:30 +0530
Subject: [PATCH 1/2] Add test for loop with different strides

---
 .../LoopAccessAnalysis/different_strides.ll   | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
new file mode 100644
index 0000000000000..ee88e9f540924
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --disable-output -mtriple=x86_64 --passes="print<access-info>" %s 2>&1 | FileCheck %s
+
+@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
+
+; Equivalent C code for the test case:
+; #define LEN 256 * 256
+; float a[LEN];
+
+; void different_strides() {
+;   for (int i = 0; i < LEN - 1024 - 255; i++) {
+;   #pragma clang loop interleave(disable)
+;   #pragma clang loop unroll(disable)
+;     for (int j = 0; j < 256; j++)
+;       a[i + j + 1024] += a[j * 4 + i];
+;   }
+; }
+define dso_local void @different_strides() local_unnamed_addr {
+; CHECK-LABEL: 'different_strides'
+; CHECK-NEXT:    for.body4:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:        Forward:
+; CHECK-NEXT:            %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:    for.cond1.preheader:
+; CHECK-NEXT:      Report: loop is not the innermost loop
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.cond.cleanup3 ]
+  %0 = add nuw nsw i64 %indvars.iv25, 1024
+  br label %for.body4
+
+for.cond.cleanup:
+  ret void
+
+for.cond.cleanup3:
+  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+  %exitcond29.not = icmp eq i64 %indvars.iv.next26, 64257
+  br i1 %exitcond29.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.body4:
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %1 = shl nuw nsw i64 %indvars.iv, 2
+  %2 = add nuw nsw i64 %1, %indvars.iv25
+  %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+  %3 = load float, ptr %arrayidx, align 4
+  %4 = add nuw nsw i64 %0, %indvars.iv
+  %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+  %5 = load float, ptr %arrayidx8, align 4
+  %add9 = fadd fast float %5, %3
+  store float %add9, ptr %arrayidx8, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
+}
+

>From 47b8c2e0ecb74dad2da40325a151317de5134e3b Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Tue, 9 Jul 2024 15:30:44 +0530
Subject: [PATCH 2/2] Use MaxStride instead of CommonStride to get Max safe VF

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp                  | 8 ++------
 .../test/Analysis/LoopAccessAnalysis/different_strides.ll | 5 ++---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 018861a665c4c..3a984fafd44d3 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2133,10 +2133,6 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
                          "different type sizes\n");
     return Dependence::Unknown;
   }
-
-  if (!CommonStride)
-    return Dependence::Unknown;
-
   // Bail out early if passed-in parameters make vectorization not feasible.
   unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                            VectorizerParams::VectorizationFactor : 1);
@@ -2176,7 +2172,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // minimum for computations below, as this ensures we compute the closest
   // possible dependence distance.
   uint64_t MinDistanceNeeded =
-      TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
+      TypeByteSize * MaxStride * (MinNumIter - 1) + TypeByteSize;
   if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
     if (!isa<SCEVConstant>(Dist)) {
       // For non-constant distances, we checked the lower bound of the
@@ -2233,7 +2229,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
 
   // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
   // since there is a backwards dependency.
-  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
+  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * MaxStride);
   LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
                     << " with max VF = " << MaxVF << '\n');
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
index ee88e9f540924..fb3efc2768966 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -18,10 +18,9 @@
 define dso_local void @different_strides() local_unnamed_addr {
 ; CHECK-LABEL: 'different_strides'
 ; CHECK-NEXT:    for.body4:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 2048 bits
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %3 = load float, ptr %arrayidx, align 4 ->
 ; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
 ; CHECK-EMPTY: