[llvm] [LAA] Use MaxStride instead of CommonStride to calculate MaxVF (PR #98142)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 9 06:18:48 PDT 2024
https://github.com/mrdaybird updated https://github.com/llvm/llvm-project/pull/98142
>From a52e5e6c77278812031bc4c4f68316dae611fee3 Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Tue, 9 Jul 2024 14:42:30 +0530
Subject: [PATCH 1/3] Add test for loop with different strides
---
.../LoopAccessAnalysis/different_strides.ll | 81 +++++++++++++++++++
1 file changed, 81 insertions(+)
create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
new file mode 100644
index 0000000000000..ee88e9f540924
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --disable-output -mtriple=x86_64 --passes="print<access-info>" %s 2>&1 | FileCheck %s
+
+ at a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
+
+; Equivalent C code for the test case:
+; #define LEN 256 * 256
+; float a[LEN];
+
+; void different_strides() {
+; for (int i = 0; i < LEN - 1024 - 255; i++) {
+; #pragma clang loop interleave(disable)
+; #pragma clang loop unroll(disable)
+; for (int j = 0; j < 256; j++)
+; a[i + j + 1024] += a[j * 4 + i];
+; }
+; }
+define dso_local void @different_strides() local_unnamed_addr {
+; CHECK-LABEL: 'different_strides'
+; CHECK-NEXT: for.body4:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: for.cond1.preheader:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.cond.cleanup3 ]
+ %0 = add nuw nsw i64 %indvars.iv25, 1024
+ br label %for.body4
+
+for.cond.cleanup:
+ ret void
+
+for.cond.cleanup3:
+ %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+ %exitcond29.not = icmp eq i64 %indvars.iv.next26, 64257
+ br i1 %exitcond29.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.body4:
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+ %1 = shl nuw nsw i64 %indvars.iv, 2
+ %2 = add nuw nsw i64 %1, %indvars.iv25
+ %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+ %3 = load float, ptr %arrayidx, align 4
+ %4 = add nuw nsw i64 %0, %indvars.iv
+ %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd fast float %5, %3
+ store float %add9, ptr %arrayidx8, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
+}
+
>From 47b8c2e0ecb74dad2da40325a151317de5134e3b Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Tue, 9 Jul 2024 15:30:44 +0530
Subject: [PATCH 2/3] Use MaxStride instead of CommonStride to get Max safe VF
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 8 ++------
.../test/Analysis/LoopAccessAnalysis/different_strides.ll | 5 ++---
2 files changed, 4 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 018861a665c4c..3a984fafd44d3 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2133,10 +2133,6 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
"different type sizes\n");
return Dependence::Unknown;
}
-
- if (!CommonStride)
- return Dependence::Unknown;
-
// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
VectorizerParams::VectorizationFactor : 1);
@@ -2176,7 +2172,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// minimum for computations below, as this ensures we compute the closest
// possible dependence distance.
uint64_t MinDistanceNeeded =
- TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
+ TypeByteSize * MaxStride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
if (!isa<SCEVConstant>(Dist)) {
// For non-constant distances, we checked the lower bound of the
@@ -2233,7 +2229,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
// since there is a backwards dependency.
- uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
+ uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * MaxStride);
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
<< " with max VF = " << MaxVF << '\n');
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
index ee88e9f540924..fb3efc2768966 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -18,10 +18,9 @@
define dso_local void @different_strides() local_unnamed_addr {
; CHECK-LABEL: 'different_strides'
; CHECK-NEXT: for.body4:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
+; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
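As a quick sanity check of the new bound on the test above (back-of-the-envelope
arithmetic from the C source; it assumes LAA ends up using the signed minimum of
the dependence distance, as the surrounding comments describe):

  load  a[j * 4 + i]     : stride 16 bytes per inner iteration
  store a[i + j + 1024]  : stride  4 bytes per inner iteration, so MaxStride = 4 elements
  distance = (1024 - 3 * j) * 4 bytes, which is at least 1036 bytes for j = 0..255
  MaxVF    = 1036 / (TypeByteSize * MaxStride) = 1036 / 16 = 64 lanes = 64 * 32 = 2048 bits

which is the value in the updated CHECK line.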
>From 72c94627cae40603f87e291bcff5fe1dd3adca94 Mon Sep 17 00:00:00 2001
From: Vaibhav Pathak <pathakvaibhav at protonmail.com>
Date: Tue, 9 Jul 2024 18:48:32 +0530
Subject: [PATCH 3/3] Update comment, update prev test and add new test
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 8 +-
.../LoopAccessAnalysis/different_strides.ll | 126 ++++++++++++++----
2 files changed, 107 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 3a984fafd44d3..78cbb179d7cee 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2143,8 +2143,9 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectroized/unrolled version. Vectorizing one iteration in
- // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
- // TypeByteSize (No need to plus the last gap distance).
+ // front needs TypeByteSize * Stride (MaxStride in the case of different strides).
+ // Vectorizing the last iteration needs TypeByteSize (no need to add the last
+ // gap distance).
//
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
// foo(int *A) {
@@ -2167,6 +2168,9 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
// the minimum distance needed is 28, which is greater than distance. It is
// not safe to do vectorization.
+ //
+ // We use MaxStride (the maximum of the source and sink strides) to get a
+ // conservative bound on MinDistanceNeeded when the strides differ.
// We know that Dist is positive, but it may not be constant. Use the signed
// minimum for computations below, as this ensures we compute the closest
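To make the updated comment concrete, a hypothetical example with different strides:
with TypeByteSize = 4, a source stride of 1 element, a sink stride of 4 elements (so
MaxStride = 4) and MinNumIter = 2, the patch requires

  MinDistanceNeeded = TypeByteSize * MaxStride * (MinNumIter - 1) + TypeByteSize
                    = 4 * 4 * 1 + 4 = 20 bytes

whereas plugging in the smaller stride would only ask for 4 * 1 * 1 + 4 = 8 bytes,
under-estimating the distance the faster-moving access needs.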
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
index fb3efc2768966..8e5562369641e 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -3,10 +3,10 @@
@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
-; Equivalent C code for the test case:
+; Generated from the following C code:
; #define LEN 256 * 256
; float a[LEN];
-
+;
; void different_strides() {
; for (int i = 0; i < LEN - 1024 - 255; i++) {
; #pragma clang loop interleave(disable)
@@ -15,9 +15,11 @@
; a[i + j + 1024] += a[j * 4 + i];
; }
; }
-define dso_local void @different_strides() local_unnamed_addr {
-; CHECK-LABEL: 'different_strides'
-; CHECK-NEXT: for.body4:
+; The load and store have different strides (16 and 4 bytes respectively), but the store
+; is always a safe positive distance away from the load, so the dependence is BackwardVectorizable.
+define dso_local void @different_strides_backward_vectorizable() local_unnamed_addr {
+; CHECK-LABEL: 'different_strides_backward_vectorizable'
+; CHECK-NEXT: inner.body:
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
; CHECK-NEXT: Dependences:
; CHECK-NEXT: BackwardVectorizable:
@@ -35,7 +37,82 @@ define dso_local void @different_strides() local_unnamed_addr {
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
-; CHECK-NEXT: for.cond1.preheader:
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %outer.header
+
+outer.header:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %outer.exit ]
+ %0 = add nuw nsw i64 %i, 1024
+ br label %inner.body
+
+inner.body:
+ %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+ %1 = shl nuw nsw i64 %j, 2
+ %2 = add nuw nsw i64 %1, %i
+ %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+ %3 = load float, ptr %arrayidx, align 4
+ %4 = add nuw nsw i64 %0, %j
+ %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd fast float %5, %3
+ store float %add9, ptr %arrayidx8, align 4
+ %j.next = add nuw nsw i64 %j, 1
+ %exitcond.not = icmp eq i64 %j.next, 256
+ br i1 %exitcond.not, label %outer.exit, label %inner.body
+
+outer.exit:
+ %i.next = add nuw nsw i64 %i, 1
+ %outerexitcond.not = icmp eq i64 %i.next, 64257
+ br i1 %outerexitcond.not, label %exit, label %outer.header
+
+exit:
+ ret void
+}
+
+
+; Generated from the following C code:
+; void different_stride_and_not_vectorizable(){
+; for(int i = 0; i < LEN2; i++){
+; for(int j = 0 ; j < LEN; j++){
+; a[i + j + LEN] += a[i + 4*j];
+; }
+; }
+; }
+; The load and store have different strides, but the store and load are not a
+; safe distance away from each other, so the loop is not safe to vectorize.
+define dso_local void @different_stride_and_not_vectorizable() local_unnamed_addr {
+; CHECK-LABEL: 'different_stride_and_not_vectorizable'
+; CHECK-NEXT: inner.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: outer.header:
; CHECK-NEXT: Report: loop is not the innermost loop
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
@@ -47,34 +124,33 @@ define dso_local void @different_strides() local_unnamed_addr {
; CHECK-NEXT: Expressions re-written:
;
entry:
- br label %for.cond1.preheader
+ br label %outer.header
-for.cond1.preheader:
- %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.cond.cleanup3 ]
- %0 = add nuw nsw i64 %indvars.iv25, 1024
- br label %for.body4
+outer.header:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %outer.exit ]
+ %0 = add nuw nsw i64 %i, 256
+ br label %inner.body
-for.cond.cleanup:
+exit:
ret void
-for.cond.cleanup3:
- %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
- %exitcond29.not = icmp eq i64 %indvars.iv.next26, 64257
- br i1 %exitcond29.not, label %for.cond.cleanup, label %for.cond1.preheader
+outer.exit:
+ %i.next = add nuw nsw i64 %i, 1
+ %exitcond29.not = icmp eq i64 %i.next, 65536
+ br i1 %exitcond29.not, label %exit, label %outer.header
-for.body4:
- %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
- %1 = shl nuw nsw i64 %indvars.iv, 2
- %2 = add nuw nsw i64 %1, %indvars.iv25
+inner.body:
+ %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+ %1 = shl nuw nsw i64 %j, 2
+ %2 = add nuw nsw i64 %1, %i
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
%3 = load float, ptr %arrayidx, align 4
- %4 = add nuw nsw i64 %0, %indvars.iv
+ %4 = add nuw nsw i64 %0, %j
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
%5 = load float, ptr %arrayidx8, align 4
%add9 = fadd fast float %5, %3
store float %add9, ptr %arrayidx8, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, 256
- br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
+ %j.next = add nuw nsw i64 %j, 1
+ %exitcond.not = icmp eq i64 %j.next, 256
+ br i1 %exitcond.not, label %outer.exit, label %inner.body
}
-
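For completeness, why the two tests diverge (a rough reading of the IR, not something the
patch itself spells out): in the first function the store a[i + j + 1024] is ahead of the
load a[i + 4 * j] by (1024 - 3 * j) elements, which stays positive for j = 0..255, so the
dependence is BackwardVectorizable with the 2048-bit bound checked above. In the second
function the store is at a[i + j + 256], so the distance of (256 - 3 * j) elements changes
sign within the inner loop; LAA cannot classify the dependence direction and reports an
Unknown data dependence.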