[llvm] 384a5b0 - [LAA] Use MaxStride instead of CommonStride to calculate MaxVF (#98142)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 7 13:02:24 PDT 2025
Author: vaibhav
Date: 2025-05-07T21:02:21+01:00
New Revision: 384a5b00a7c8fffa72f7fe7021863d00da842a19
URL: https://github.com/llvm/llvm-project/commit/384a5b00a7c8fffa72f7fe7021863d00da842a19
DIFF: https://github.com/llvm/llvm-project/commit/384a5b00a7c8fffa72f7fe7021863d00da842a19.diff
LOG: [LAA] Use MaxStride instead of CommonStride to calculate MaxVF (#98142)
We currently bail out of the MaxVF calculation when the source and sink
strides are not the same, and instead depend on runtime checks that are not
yet implemented for this case. We can use MaxStride instead to compute a
conservative upper bound on the MaxVF. This handles cases like the following:
```c
#define LEN 256 * 256
float a[LEN];
void gather() {
for (int i = 0; i < LEN - 1024 - 255; i++) {
#pragma clang loop interleave(disable)
#pragma clang loop unroll(disable)
for (int j = 0; j < 256; j++)
a[i + j + 1024] += a[j * 4 + i];
}
}
```
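To make the new bound concrete, here is a minimal standalone sketch of the arithmetic, not the actual LAA implementation; the names mirror the variables in MemoryDepChecker::isDependent but are illustrative only. The 1036-byte minimum dependence distance is a back-of-the-envelope value for the loop above: the store-load byte distance is 4*(1024 - 3*j) = 4096 - 12*j, minimized at j = 255. MinNumIter = 2 is the default lower bound when no factor is forced.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Sketch of the bound this patch switches to: when the source and sink
// strides differ, the larger of the two gives a conservative
// MinDistanceNeeded and a conservative MaxVF.
static uint64_t maxVFFromMaxStride(uint64_t MinDistanceBytes,
                                   uint64_t SrcStrideBytes,
                                   uint64_t SinkStrideBytes,
                                   uint64_t TypeByteSize,
                                   uint64_t MinNumIter) {
  // Use the larger of the two strides when they differ.
  uint64_t MaxStride = std::max(SrcStrideBytes, SinkStrideBytes);
  // Vectorizing MinNumIter iterations advances the faster access by
  // MaxStride * (MinNumIter - 1) bytes, plus TypeByteSize for the last
  // access.
  uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
  if (MinDistanceNeeded > MinDistanceBytes)
    return 0; // Not safe to vectorize with MinNumIter lanes.
  return MinDistanceBytes / MaxStride; // Conservative MaxVF in elements.
}

int main() {
  // Values for the loop above: float (4 bytes), load stride 16 bytes,
  // store stride 4 bytes, minimum dependence distance 1036 bytes.
  uint64_t MaxVF = maxVFFromMaxStride(1036, 16, 4, 4, 2);
  printf("MaxVF = %llu elements = %llu bits\n", (unsigned long long)MaxVF,
         (unsigned long long)(MaxVF * 32));
  // Prints: MaxVF = 64 elements = 2048 bits, matching the "maximum safe
  // vector width of 2048 bits" reported by the new test below.
  return 0;
}
```

Dividing by MaxStride rather than *CommonStride is what lets the analysis return a finite MaxVF instead of bailing out with Dependence::Unknown when the strides differ.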
---------
Co-authored-by: Florian Hahn <flo at fhahn.com>
Added:
llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
Modified:
llvm/lib/Analysis/LoopAccessAnalysis.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 7ec9bdb318ffb..f222a9905c3bb 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2148,10 +2148,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
"
diff erent type sizes\n");
return Dependence::Unknown;
}
-
- if (!CommonStride)
- return Dependence::Unknown;
-
// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
VectorizerParams::VectorizationFactor : 1);
@@ -2162,7 +2158,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
- // front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
+ // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
// (No need to plus the last gap distance).
//
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
@@ -2186,11 +2182,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
// the minimum distance needed is 28, which is greater than distance. It is
// not safe to do vectorization.
+ //
+ // We use MaxStride (maximum of src and sink strides) to get a conservative
+ // lower bound on the MinDistanceNeeded in case of different strides.
// We know that Dist is positive, but it may not be constant. Use the signed
// minimum for computations below, as this ensures we compute the closest
// possible dependence distance.
- uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
+ uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
if (!ConstDist) {
// For non-constant distances, we checked the lower bound of the
@@ -2236,7 +2235,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride))
return Dependence::BackwardVectorizableButPreventsForwarding;
- uint64_t MaxVF = MinDepDistBytes / *CommonStride;
+ uint64_t MaxVF = MinDepDistBytes / MaxStride;
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
<< " with max VF = " << MaxVF << '\n');
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
new file mode 100644
index 0000000000000..c5f31de2f8233
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<access-info>" %s 2>&1 | FileCheck %s
+
+@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
+
+; Generated from the following C code:
+; #define LEN 256 * 256
+; float a[LEN];
+;
+; void different_strides() {
+; for (int i = 0; i < LEN - 1024 - 255; i++) {
+; #pragma clang loop interleave(disable)
+; #pragma clang loop unroll(disable)
+; for (int j = 0; j < 256; j++)
+; a[i + j + 1024] += a[j * 4 + i];
+; }
+; }
+; The load and store have different strides (4 and 16 bytes respectively), but the store
+; is always at a safe positive distance away from the load, thus BackwardVectorizable.
+define void @different_strides_backward_vectorizable() {
+; CHECK-LABEL: 'different_strides_backward_vectorizable'
+; CHECK-NEXT: inner.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %outer.header
+
+outer.header:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
+ %0 = add nuw nsw i64 %i, 1024
+ br label %inner.body
+
+inner.body:
+ %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+ %1 = shl nuw nsw i64 %j, 2
+ %2 = add nuw nsw i64 %1, %i
+ %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+ %3 = load float, ptr %arrayidx, align 4
+ %4 = add nuw nsw i64 %0, %j
+ %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd fast float %5, %3
+ store float %add9, ptr %arrayidx8, align 4
+ %j.next = add nuw nsw i64 %j, 1
+ %exitcond.not = icmp eq i64 %j.next, 256
+ br i1 %exitcond.not, label %outer.latch, label %inner.body
+
+outer.latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %outerexitcond.not = icmp eq i64 %i.next, 64257
+ br i1 %outerexitcond.not, label %exit, label %outer.header
+
+exit:
+ ret void
+}
+
+
+; Generated from the following C code:
+; void different_stride_and_not_vectorizable() {
+; for(int i = 0; i < LEN2; i++){
+; for(int j = 0 ; j < LEN; j++){
+; a[i + j + LEN] += a[i + 4*j];
+; }
+; }
+; }
+; The load and store have different strides, but the store and load are not at a
+; safe distance away from each other, thus not safe for vectorization.
+define void @different_stride_and_not_vectorizable() {
+; CHECK-LABEL: 'different_stride_and_not_vectorizable'
+; CHECK-NEXT: inner.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %outer.header
+
+outer.header:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
+ %0 = add nuw nsw i64 %i, 256
+ br label %inner.body
+
+inner.body:
+ %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+ %1 = shl nuw nsw i64 %j, 2
+ %2 = add nuw nsw i64 %1, %i
+ %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+ %3 = load float, ptr %arrayidx, align 4
+ %4 = add nuw nsw i64 %0, %j
+ %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd fast float %5, %3
+ store float %add9, ptr %arrayidx8, align 4
+ %j.next = add nuw nsw i64 %j, 1
+ %exitcond.not = icmp eq i64 %j.next, 256
+ br i1 %exitcond.not, label %outer.latch, label %inner.body
+
+outer.latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %exitcond29.not = icmp eq i64 %i.next, 65536
+ br i1 %exitcond29.not, label %exit, label %outer.header
+
+exit:
+ ret void
+}
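For reference, the analysis output in the checks above can be reproduced locally with a command along these lines, mirroring the test's RUN line (`-disable-output` suppresses the transformed-IR dump so only the analysis report is printed):

```sh
opt -passes='print<access-info>' -disable-output \
    llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
```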