[llvm] [LAA] Use MaxStride instead of CommonStride to calculate MaxVF (PR #98142)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 12 00:06:40 PDT 2024


https://github.com/mrdaybird updated https://github.com/llvm/llvm-project/pull/98142

>From 1763f464a5552c6a96f7717e91a53871bc34a622 Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Tue, 9 Jul 2024 14:42:30 +0530
Subject: [PATCH 1/4] [LAA] Add test for loop with different strides

---
 .../LoopAccessAnalysis/different_strides.ll   | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
new file mode 100644
index 0000000000000..ee88e9f540924
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --disable-output -mtriple=x86_64 --passes="print<access-info>" %s 2>&1 | FileCheck %s
+
+@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
+
+; Equivalent C code for the test case:
+; #define LEN 256 * 256
+; float a[LEN];
+
+; void different_strides() {
+;   for (int i = 0; i < LEN - 1024 - 255; i++) {
+;   #pragma clang loop interleave(disable)
+;   #pragma clang loop unroll(disable)
+;     for (int j = 0; j < 256; j++)
+;       a[i + j + 1024] += a[j * 4 + i];
+;   }
+; }
+define dso_local void @different_strides() local_unnamed_addr {
+; CHECK-LABEL: 'different_strides'
+; CHECK-NEXT:    for.body4:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:        Forward:
+; CHECK-NEXT:            %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:    for.cond1.preheader:
+; CHECK-NEXT:      Report: loop is not the innermost loop
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.cond.cleanup3 ]
+  %0 = add nuw nsw i64 %indvars.iv25, 1024
+  br label %for.body4
+
+for.cond.cleanup:
+  ret void
+
+for.cond.cleanup3:
+  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+  %exitcond29.not = icmp eq i64 %indvars.iv.next26, 64257
+  br i1 %exitcond29.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.body4:
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+  %1 = shl nuw nsw i64 %indvars.iv, 2
+  %2 = add nuw nsw i64 %1, %indvars.iv25
+  %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+  %3 = load float, ptr %arrayidx, align 4
+  %4 = add nuw nsw i64 %0, %indvars.iv
+  %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+  %5 = load float, ptr %arrayidx8, align 4
+  %add9 = fadd fast float %5, %3
+  store float %add9, ptr %arrayidx8, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 256
+  br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
+}
+

>From 98e234869a9d79e2c64c8377d359c2eaebb9cb5e Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Tue, 9 Jul 2024 15:30:44 +0530
Subject: [PATCH 2/4] [LAA] Use MaxStride instead of CommonStride to get Max
 safe VF

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp                  | 8 ++------
 .../test/Analysis/LoopAccessAnalysis/different_strides.ll | 5 ++---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 018861a665c4c..3a984fafd44d3 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2133,10 +2133,6 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
                          "different type sizes\n");
     return Dependence::Unknown;
   }
-
-  if (!CommonStride)
-    return Dependence::Unknown;
-
   // Bail out early if passed-in parameters make vectorization not feasible.
   unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                            VectorizerParams::VectorizationFactor : 1);
@@ -2176,7 +2172,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // minimum for computations below, as this ensures we compute the closest
   // possible dependence distance.
   uint64_t MinDistanceNeeded =
-      TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
+      TypeByteSize * MaxStride * (MinNumIter - 1) + TypeByteSize;
   if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
     if (!isa<SCEVConstant>(Dist)) {
       // For non-constant distances, we checked the lower bound of the
@@ -2233,7 +2229,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
 
   // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
   // since there is a backwards dependency.
-  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
+  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * MaxStride);
   LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
                     << " with max VF = " << MaxVF << '\n');
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
index ee88e9f540924..fb3efc2768966 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -18,10 +18,9 @@
 define dso_local void @different_strides() local_unnamed_addr {
 ; CHECK-LABEL: 'different_strides'
 ; CHECK-NEXT:    for.body4:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 2048 bits
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:        BackwardVectorizable:
 ; CHECK-NEXT:            %3 = load float, ptr %arrayidx, align 4 ->
 ; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
 ; CHECK-EMPTY:

>From 5f8f7035b4b946cdfc52d3d93bb11cdc929514f1 Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Tue, 9 Jul 2024 18:48:32 +0530
Subject: [PATCH 3/4] [LAA] Update comment, update prev test and add new test

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |   8 +-
 .../LoopAccessAnalysis/different_strides.ll   | 126 ++++++++++++++----
 2 files changed, 107 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 3a984fafd44d3..78cbb179d7cee 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2143,8 +2143,9 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
 
   // It's not vectorizable if the distance is smaller than the minimum distance
   // needed for a vectroized/unrolled version. Vectorizing one iteration in
-  // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
-  // TypeByteSize (No need to plus the last gap distance).
+  // front needs TypeByteSize * Stride (MaxStride if the strides differ).
+  // Vectorizing the last iteration needs TypeByteSize (no need to add the last
+  // gap distance).
   //
   // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
   //      foo(int *A) {
@@ -2167,6 +2168,9 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
   // the minimum distance needed is 28, which is greater than distance. It is
   // not safe to do vectorization.
+  //
+  // We use MaxStride (the maximum of the source and sink strides) to get a
+  // conservative lower bound on MinDistanceNeeded when the strides differ.
 
   // We know that Dist is positive, but it may not be constant. Use the signed
   // minimum for computations below, as this ensures we compute the closest
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
index fb3efc2768966..8e5562369641e 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -3,10 +3,10 @@
 
 @a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
 
-; Equivalent C code for the test case:
+; Generated from the following C code:
 ; #define LEN 256 * 256
 ; float a[LEN];
-
+;
 ; void different_strides() {
 ;   for (int i = 0; i < LEN - 1024 - 255; i++) {
 ;   #pragma clang loop interleave(disable)
@@ -15,9 +15,11 @@
 ;       a[i + j + 1024] += a[j * 4 + i];
 ;   }
 ; }
-define dso_local void @different_strides() local_unnamed_addr {
-; CHECK-LABEL: 'different_strides'
-; CHECK-NEXT:    for.body4:
+; The load and store have different strides (16 and 4 bytes respectively), but the store
+; is always at a safe positive distance from the load, thus BackwardVectorizable.
+define dso_local void @different_strides_backward_vectorizable() local_unnamed_addr {
+; CHECK-LABEL: 'different_strides_backward_vectorizable'
+; CHECK-NEXT:    inner.body:
 ; CHECK-NEXT:      Memory dependences are safe with a maximum safe vector width of 2048 bits
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:        BackwardVectorizable:
@@ -35,7 +37,82 @@ define dso_local void @different_strides() local_unnamed_addr {
 ; CHECK-NEXT:      SCEV assumptions:
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Expressions re-written:
-; CHECK-NEXT:    for.cond1.preheader:
+; CHECK-NEXT:    outer.header:
+; CHECK-NEXT:      Report: loop is not the innermost loop
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.exit ]
+  %0 = add nuw nsw i64 %i, 1024
+  br label %inner.body
+
+inner.body:
+  %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+  %1 = shl nuw nsw i64 %j, 2
+  %2 = add nuw nsw i64 %1, %i
+  %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+  %3 = load float, ptr %arrayidx, align 4
+  %4 = add nuw nsw i64 %0, %j
+  %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+  %5 = load float, ptr %arrayidx8, align 4
+  %add9 = fadd fast float %5, %3
+  store float %add9, ptr %arrayidx8, align 4
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond.not = icmp eq i64 %j.next, 256
+  br i1 %exitcond.not, label %outer.exit, label %inner.body
+
+outer.exit:
+  %i.next = add nuw nsw i64 %i, 1
+  %outerexitcond.not = icmp eq i64 %i.next, 64257
+  br i1 %outerexitcond.not, label %exit, label %outer.header
+
+exit:
+  ret void
+}
+
+
+; Generated from the following C code:
+; void different_stride_and_not_vectorizable(){
+;    for(int i = 0; i < LEN2; i++){
+;        for(int j = 0 ; j < LEN; j++){
+;            a[i + j + LEN] += a[i + 4*j];
+;        }
+;    }
+; }
+; The load and store have different strides, but the store and load are not at a
+; safe distance away from each other, thus not safe for vectorization.
+define dso_local void @different_stride_and_not_vectorizable() local_unnamed_addr {
+; CHECK-LABEL: 'different_stride_and_not_vectorizable'
+; CHECK-NEXT:    inner.body:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:        Forward:
+; CHECK-NEXT:            %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT:            store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+; CHECK-NEXT:    outer.header:
 ; CHECK-NEXT:      Report: loop is not the innermost loop
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
@@ -47,34 +124,33 @@ define dso_local void @different_strides() local_unnamed_addr {
 ; CHECK-NEXT:      Expressions re-written:
 ;
 entry:
-  br label %for.cond1.preheader
+  br label %outer.header
 
-for.cond1.preheader:
-  %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.cond.cleanup3 ]
-  %0 = add nuw nsw i64 %indvars.iv25, 1024
-  br label %for.body4
+outer.header:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.exit ]
+  %0 = add nuw nsw i64 %i, 256
+  br label %inner.body
 
-for.cond.cleanup:
+exit:
   ret void
 
-for.cond.cleanup3:
-  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
-  %exitcond29.not = icmp eq i64 %indvars.iv.next26, 64257
-  br i1 %exitcond29.not, label %for.cond.cleanup, label %for.cond1.preheader
+outer.exit:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond29.not = icmp eq i64 %i.next, 65536
+  br i1 %exitcond29.not, label %exit, label %outer.header
 
-for.body4:
-  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
-  %1 = shl nuw nsw i64 %indvars.iv, 2
-  %2 = add nuw nsw i64 %1, %indvars.iv25
+inner.body:
+  %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+  %1 = shl nuw nsw i64 %j, 2
+  %2 = add nuw nsw i64 %1, %i
   %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
   %3 = load float, ptr %arrayidx, align 4
-  %4 = add nuw nsw i64 %0, %indvars.iv
+  %4 = add nuw nsw i64 %0, %j
   %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
   %5 = load float, ptr %arrayidx8, align 4
   %add9 = fadd fast float %5, %3
   store float %add9, ptr %arrayidx8, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 256
-  br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond.not = icmp eq i64 %j.next, 256
+  br i1 %exitcond.not, label %outer.exit, label %inner.body
 }
-

>From d7668a713afdffe4cf23a74b06176fef3f1dd4fc Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Fri, 12 Jul 2024 12:21:41 +0530
Subject: [PATCH 4/4] [LAA] Update test

---
 .../LoopAccessAnalysis/different_strides.ll   | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
index 8e5562369641e..94ef300dd22aa 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -52,7 +52,7 @@ entry:
   br label %outer.header
 
 outer.header:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.exit ]
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
   %0 = add nuw nsw i64 %i, 1024
   br label %inner.body
 
@@ -69,9 +69,9 @@ inner.body:
   store float %add9, ptr %arrayidx8, align 4
   %j.next = add nuw nsw i64 %j, 1
   %exitcond.not = icmp eq i64 %j.next, 256
-  br i1 %exitcond.not, label %outer.exit, label %inner.body
+  br i1 %exitcond.not, label %outer.latch, label %inner.body
 
-outer.exit:
+outer.latch:
   %i.next = add nuw nsw i64 %i, 1
   %outerexitcond.not = icmp eq i64 %i.next, 64257
   br i1 %outerexitcond.not, label %exit, label %outer.header
@@ -127,18 +127,10 @@ entry:
   br label %outer.header
 
 outer.header:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.exit ]
+  %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
   %0 = add nuw nsw i64 %i, 256
   br label %inner.body
 
-exit:
-  ret void
-
-outer.exit:
-  %i.next = add nuw nsw i64 %i, 1
-  %exitcond29.not = icmp eq i64 %i.next, 65536
-  br i1 %exitcond29.not, label %exit, label %outer.header
-
 inner.body:
   %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
   %1 = shl nuw nsw i64 %j, 2
@@ -152,5 +144,13 @@ inner.body:
   store float %add9, ptr %arrayidx8, align 4
   %j.next = add nuw nsw i64 %j, 1
   %exitcond.not = icmp eq i64 %j.next, 256
-  br i1 %exitcond.not, label %outer.exit, label %inner.body
+  br i1 %exitcond.not, label %outer.latch, label %inner.body
+
+outer.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond29.not = icmp eq i64 %i.next, 65536
+  br i1 %exitcond29.not, label %exit, label %outer.header
+
+exit:
+  ret void
 }



More information about the llvm-commits mailing list