[llvm] LAA: scale strides using type-size (PR #124529)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 27 04:01:46 PST 2025
https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/124529
>From 5e3e565b1002fd0f60c771b431811098b0562eaf Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Mon, 27 Jan 2025 10:51:31 +0000
Subject: [PATCH 1/2] LAA: scale strides using type-size
Change getDependenceDistanceStrideAndSize to scale strides by
TypeByteSize, scaling the returned CommonStride and MaxStride. We now
correctly detect that there is indeed a common stride in cases where we
previously failed to, due to differing type sizes.
---
.../llvm/Analysis/LoopAccessAnalysis.h | 2 +-
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 72 +++++++++----------
.../forward-loop-carried.ll | 4 --
.../stride-access-dependence.ll | 38 ++--------
4 files changed, 41 insertions(+), 75 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 31374a128856c7..3059cfcb9bbccd 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -370,7 +370,7 @@ class MemoryDepChecker {
/// Strides could either be scaled (in bytes, taking the size of the
/// underlying type into account), or unscaled (in indexing units; unscaled
/// stride = scaled stride / size of underlying type). Here, strides are
- /// unscaled.
+ /// scaled.
uint64_t MaxStride;
std::optional<uint64_t> CommonStride;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a68979add666d..571e63a3ccb46c 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1799,8 +1799,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
/// }
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
const SCEV &MaxBTC, const SCEV &Dist,
- uint64_t MaxStride,
- uint64_t TypeByteSize) {
+ uint64_t MaxStride) {
// If we can prove that
// (**) |Dist| > MaxBTC * Step
@@ -1819,8 +1818,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
// will be executed only if LoopCount >= VF, proving distance >= LoopCount
// also guarantees that distance >= VF.
//
- const uint64_t ByteStride = MaxStride * TypeByteSize;
- const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
+ const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride);
const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);
const SCEV *CastedDist = &Dist;
@@ -1854,24 +1852,16 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
/// bytes.
///
/// \returns true if they are independent.
-static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
- uint64_t TypeByteSize) {
+static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride) {
assert(Stride > 1 && "The stride must be greater than 1");
- assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
assert(Distance > 0 && "The distance must be non-zero");
- // Skip if the distance is not multiple of type byte size.
- if (Distance % TypeByteSize)
- return false;
-
- uint64_t ScaledDist = Distance / TypeByteSize;
-
- // No dependence if the scaled distance is not multiple of the stride.
+ // No dependence if the distance is not multiple of the stride.
// E.g.
// for (i = 0; i < 1024 ; i += 4)
// A[i+2] = A[i] + 1;
//
- // Two accesses in memory (scaled distance is 2, stride is 4):
+ // Two accesses in memory (distance is 2, stride is 4):
// | A[0] | | | | A[4] | | | |
// | | | A[2] | | | | A[6] | |
//
@@ -1879,10 +1869,10 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
// for (i = 0; i < 1024 ; i += 3)
// A[i+4] = A[i] + 1;
//
- // Two accesses in memory (scaled distance is 4, stride is 3):
+ // Two accesses in memory (distance is 4, stride is 3):
// | A[0] | | | A[3] | | | A[6] | | |
// | | | | | A[4] | | | A[7] | |
- return ScaledDist % Stride;
+ return Distance % Stride;
}
std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1990,25 +1980,34 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
return MemoryDepChecker::Dependence::Unknown;
}
- uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
- bool HasSameSize =
- DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
- if (!HasSameSize)
- TypeByteSize = 0;
+ TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
+ BStoreSz = DL.getTypeStoreSize(BTy);
+
+ // Fail early if either store size is scalable.
+ if (AStoreSz.isScalable() || BStoreSz.isScalable())
+ return MemoryDepChecker::Dependence::Unknown;
+
+ // The TypeByteSize is used to scale Distance and VF. In these contexts, the
+ // only size that matters is the size of the Sink. If store sizes are not the
+ // same, set TypeByteSize to zero, so we can check it in the caller.
+ uint64_t ASz = alignTo(AStoreSz, DL.getABITypeAlign(ATy)),
+ TypeByteSize = AStoreSz == BStoreSz
+ ? alignTo(BStoreSz, DL.getABITypeAlign(BTy))
+ : 0;
- StrideAPtrInt = std::abs(StrideAPtrInt);
- StrideBPtrInt = std::abs(StrideBPtrInt);
+ uint64_t StrideAScaled = std::abs(StrideAPtrInt) * ASz;
+ uint64_t StrideBScaled = std::abs(StrideBPtrInt) * TypeByteSize;
- uint64_t MaxStride = std::max(StrideAPtrInt, StrideBPtrInt);
+ uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled);
std::optional<uint64_t> CommonStride;
- if (StrideAPtrInt == StrideBPtrInt)
- CommonStride = StrideAPtrInt;
+ if (StrideAScaled == StrideBScaled)
+ CommonStride = StrideAScaled;
// TODO: Historically, we don't retry with runtime checks unless the
// (unscaled) strides are the same. Fix this once the condition for runtime
// checks in isDependent is fixed.
- bool ShouldRetryWithRuntimeCheck = CommonStride.has_value();
+ bool ShouldRetryWithRuntimeCheck = StrideAPtrInt == StrideBPtrInt;
return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
ShouldRetryWithRuntimeCheck, TypeByteSize,
@@ -2048,9 +2047,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// upper bound of the number of iterations), the accesses are independet, i.e.
// they are far enough appart that accesses won't access the same location
// across all loop ierations.
- if (HasSameSize && isSafeDependenceDistance(
- DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
- *Dist, MaxStride, TypeByteSize))
+ if (HasSameSize &&
+ isSafeDependenceDistance(
+ DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride))
return Dependence::NoDep;
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
@@ -2062,7 +2061,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// If the distance between accesses and their strides are known constants,
// check whether the accesses interlace each other.
if (Distance > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
- areStridedAccessesIndependent(Distance, *CommonStride, TypeByteSize)) {
+ areStridedAccessesIndependent(Distance, *CommonStride)) {
LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
return Dependence::NoDep;
}
@@ -2154,8 +2153,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectroized/unrolled version. Vectorizing one iteration in
- // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
- // TypeByteSize (No need to plus the last gap distance).
+ // front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
+ // (No need to plus the last gap distance).
//
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
// foo(int *A) {
@@ -2182,8 +2181,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// We know that Dist is positive, but it may not be constant. Use the signed
// minimum for computations below, as this ensures we compute the closest
// possible dependence distance.
- uint64_t MinDistanceNeeded =
- TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
+ uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
if (!ConstDist) {
// For non-constant distances, we checked the lower bound of the
@@ -2239,7 +2237,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
// since there is a backwards dependency.
- uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
+ uint64_t MaxVF = MinDepDistBytes / *CommonStride;
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
<< " with max VF = " << MaxVF << '\n');
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
index adfd19923e921c..455b23c4a8637c 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
@@ -62,10 +62,6 @@ define void @forward_different_access_sizes(ptr readnone %end, ptr %start) {
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Forward:
-; CHECK-NEXT: %l = load i24, ptr %gep.1, align 1 ->
-; CHECK-NEXT: store i24 %l, ptr %ptr.iv, align 1
-; CHECK-EMPTY:
; CHECK-NEXT: ForwardButPreventsForwarding:
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i24, ptr %gep.1, align 1
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
index ef19e173b65994..cac29a0a2cb10f 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -416,13 +416,8 @@ for.body: ; preds = %entry, %for.body
define void @vectorizable_unscaled_Read_Write(ptr nocapture %A) {
; CHECK-LABEL: 'vectorizable_unscaled_Read_Write'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -464,12 +459,8 @@ for.body: ; preds = %entry, %for.body
define i32 @vectorizable_unscaled_Write_Read(ptr nocapture %A) {
; CHECK-LABEL: 'vectorizable_unscaled_Write_Read'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
-; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -508,13 +499,8 @@ for.body: ; preds = %entry, %for.body
define void @unsafe_unscaled_Read_Write(ptr nocapture %A) {
; CHECK-LABEL: 'unsafe_unscaled_Read_Write'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -551,13 +537,8 @@ for.body: ; preds = %entry, %for.body
define void @unsafe_unscaled_Read_Write2(ptr nocapture %A) {
; CHECK-LABEL: 'unsafe_unscaled_Read_Write2'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
@@ -602,17 +583,8 @@ for.body: ; preds = %entry, %for.body
define void @interleaved_stores(ptr nocapture %A) {
; CHECK-LABEL: 'interleaved_stores'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx9, align 4
-; CHECK-EMPTY:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx2, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4
-; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
>From ba5a63e622d2d3a777c7f48cc7616cca92fb21d4 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Mon, 27 Jan 2025 12:01:17 +0000
Subject: [PATCH 2/2] LAA: fix thinko
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 571e63a3ccb46c..fe68f2447bbe25 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1987,16 +1987,14 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
if (AStoreSz.isScalable() || BStoreSz.isScalable())
return MemoryDepChecker::Dependence::Unknown;
- // The TypeByteSize is used to scale Distance and VF. In these contexts, the
- // only size that matters is the size of the Sink. If store sizes are not the
- // same, set TypeByteSize to zero, so we can check it in the caller.
+ // If store sizes are not the same, set TypeByteSize to zero, so we can check
+ // it in the caller.
uint64_t ASz = alignTo(AStoreSz, DL.getABITypeAlign(ATy)),
- TypeByteSize = AStoreSz == BStoreSz
- ? alignTo(BStoreSz, DL.getABITypeAlign(BTy))
- : 0;
+ BSz = alignTo(BStoreSz, DL.getABITypeAlign(BTy)),
+ TypeByteSize = AStoreSz == BStoreSz ? BSz : 0;
uint64_t StrideAScaled = std::abs(StrideAPtrInt) * ASz;
- uint64_t StrideBScaled = std::abs(StrideBPtrInt) * TypeByteSize;
+ uint64_t StrideBScaled = std::abs(StrideBPtrInt) * BSz;
uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled);
More information about the llvm-commits
mailing list