[llvm] Use MaxStride instead of CommonStride to calculate MaxVF (PR #98142)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 9 03:15:21 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-analysis
Author: vaibhav (mrdaybird)
<details>
<summary>Changes</summary>
Currently, the safe MaxVF calculation bails out when the strides of the two accesses are not the same, which leaves such loops dependent on runtime checks that are not yet implemented. Instead of bailing out, we could use MaxStride to calculate MaxVF.
This handles cases like the following:
```c
#define LEN 256 * 256
float a[LEN];
void gather() {
for (int i = 0; i < LEN - 1024 - 255; i++) {
#pragma clang loop interleave(disable)
#pragma clang loop unroll(disable)
for (int j = 0; j < 256; j++)
a[i + j + 1024] += a[j * 4 + i];
}
}
```
I am not sure this is correct in all cases, but intuitively it seems right.
---
Full diff: https://github.com/llvm/llvm-project/pull/98142.diff
2 Files Affected:
- (modified) llvm/lib/Analysis/LoopAccessAnalysis.cpp (+2-6)
- (added) llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll (+80)
``````````diff
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 018861a665c4c..3a984fafd44d3 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2133,10 +2133,6 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
"different type sizes\n");
return Dependence::Unknown;
}
-
- if (!CommonStride)
- return Dependence::Unknown;
-
// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
VectorizerParams::VectorizationFactor : 1);
@@ -2176,7 +2172,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// minimum for computations below, as this ensures we compute the closest
// possible dependence distance.
uint64_t MinDistanceNeeded =
- TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
+ TypeByteSize * MaxStride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
if (!isa<SCEVConstant>(Dist)) {
// For non-constant distances, we checked the lower bound of the
@@ -2233,7 +2229,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
// since there is a backwards dependency.
- uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
+ uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * MaxStride);
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
<< " with max VF = " << MaxVF << '\n');
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
new file mode 100644
index 0000000000000..fb3efc2768966
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --disable-output -mtriple=x86_64 --passes="print<access-info>" %s 2>&1 | FileCheck %s
+
+@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
+
+; Equivalent C code for the test case:
+; #define LEN 256 * 256
+; float a[LEN];
+
+; void different_strides() {
+; for (int i = 0; i < LEN - 1024 - 255; i++) {
+; #pragma clang loop interleave(disable)
+; #pragma clang loop unroll(disable)
+; for (int j = 0; j < 256; j++)
+; a[i + j + 1024] += a[j * 4 + i];
+; }
+; }
+define dso_local void @different_strides() local_unnamed_addr {
+; CHECK-LABEL: 'different_strides'
+; CHECK-NEXT: for.body4:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: for.cond1.preheader:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.cond.cleanup3 ]
+ %0 = add nuw nsw i64 %indvars.iv25, 1024
+ br label %for.body4
+
+for.cond.cleanup:
+ ret void
+
+for.cond.cleanup3:
+ %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+ %exitcond29.not = icmp eq i64 %indvars.iv.next26, 64257
+ br i1 %exitcond29.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.body4:
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+ %1 = shl nuw nsw i64 %indvars.iv, 2
+ %2 = add nuw nsw i64 %1, %indvars.iv25
+ %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+ %3 = load float, ptr %arrayidx, align 4
+ %4 = add nuw nsw i64 %0, %indvars.iv
+ %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd fast float %5, %3
+ store float %add9, ptr %arrayidx8, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond.not, label %for.cond.cleanup3, label %for.body4
+}
+
``````````
</details>
https://github.com/llvm/llvm-project/pull/98142
More information about the llvm-commits mailing list