[llvm] [LAA] Scale strides using type-size (NFC) (PR #124529)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 20 05:47:00 PST 2025
https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/124529
>From f169b7f4e8a47a124b6b4931eb656f9e19cd9eb9 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Mon, 27 Jan 2025 10:51:31 +0000
Subject: [PATCH 1/2] LAA: scale strides using type-size
Change getDependenceDistanceStrideAndSize to scale strides by
TypeByteSize, scaling the returned CommonStride and MaxStride. We now
correctly detect that there is indeed a common stride, when we failed to
previously due to differing type sizes.
---
.../llvm/Analysis/LoopAccessAnalysis.h | 2 +-
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 55 +++++++++----------
2 files changed, 27 insertions(+), 30 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index b675c4f875448..9b997f395d186 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -370,7 +370,7 @@ class MemoryDepChecker {
/// Strides could either be scaled (in bytes, taking the size of the
/// underlying type into account), or unscaled (in indexing units; unscaled
/// stride = scaled stride / size of underlying type). Here, strides are
- /// unscaled.
+ /// scaled.
uint64_t MaxStride;
std::optional<uint64_t> CommonStride;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 7d6dbd51a404d..232e28b78fc29 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1795,8 +1795,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
/// }
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
const SCEV &MaxBTC, const SCEV &Dist,
- uint64_t MaxStride,
- uint64_t TypeByteSize) {
+ uint64_t MaxStride) {
// If we can prove that
// (**) |Dist| > MaxBTC * Step
@@ -1815,8 +1814,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
// will be executed only if LoopCount >= VF, proving distance >= LoopCount
// also guarantees that distance >= VF.
//
- const uint64_t ByteStride = MaxStride * TypeByteSize;
- const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
+ const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride);
const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);
const SCEV *CastedDist = &Dist;
@@ -1860,14 +1858,12 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
if (Distance % TypeByteSize)
return false;
- uint64_t ScaledDist = Distance / TypeByteSize;
-
- // No dependence if the scaled distance is not multiple of the stride.
+ // No dependence if the distance is not multiple of the stride.
// E.g.
// for (i = 0; i < 1024 ; i += 4)
// A[i+2] = A[i] + 1;
//
- // Two accesses in memory (scaled distance is 2, stride is 4):
+ // Two accesses in memory (distance is 2, stride is 4):
// | A[0] | | | | A[4] | | | |
// | | | A[2] | | | | A[6] | |
//
@@ -1875,10 +1871,10 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
// for (i = 0; i < 1024 ; i += 3)
// A[i+4] = A[i] + 1;
//
- // Two accesses in memory (scaled distance is 4, stride is 3):
+ // Two accesses in memory (distance is 4, stride is 3):
// | A[0] | | | A[3] | | | A[6] | | |
// | | | | | A[4] | | | A[7] | |
- return ScaledDist % Stride;
+ return Distance % Stride;
}
std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1987,25 +1983,27 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
return MemoryDepChecker::Dependence::Unknown;
}
- uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
- bool HasSameSize =
- DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
- if (!HasSameSize)
- TypeByteSize = 0;
+ TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
+ BStoreSz = DL.getTypeStoreSize(BTy);
+
+ // If store sizes are not the same, set TypeByteSize to zero, so we can check
+ // it in the caller.
+ uint64_t ASz = DL.getTypeAllocSize(ATy), BSz = DL.getTypeAllocSize(BTy),
+ TypeByteSize = AStoreSz == BStoreSz ? BSz : 0;
- StrideAPtrInt = std::abs(StrideAPtrInt);
- StrideBPtrInt = std::abs(StrideBPtrInt);
+ uint64_t StrideAScaled = std::abs(StrideAPtrInt) * ASz;
+ uint64_t StrideBScaled = std::abs(StrideBPtrInt) * BSz;
- uint64_t MaxStride = std::max(StrideAPtrInt, StrideBPtrInt);
+ uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled);
std::optional<uint64_t> CommonStride;
- if (StrideAPtrInt == StrideBPtrInt)
- CommonStride = StrideAPtrInt;
+ if (StrideAScaled == StrideBScaled)
+ CommonStride = StrideAScaled;
// TODO: Historically, we don't retry with runtime checks unless the
// (unscaled) strides are the same. Fix this once the condition for runtime
// checks in isDependent is fixed.
- bool ShouldRetryWithRuntimeCheck = CommonStride.has_value();
+ bool ShouldRetryWithRuntimeCheck = StrideAPtrInt == StrideBPtrInt;
return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
ShouldRetryWithRuntimeCheck, TypeByteSize,
@@ -2045,9 +2043,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// upper bound of the number of iterations), the accesses are independet, i.e.
// they are far enough appart that accesses won't access the same location
// across all loop ierations.
- if (HasSameSize && isSafeDependenceDistance(
- DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
- *Dist, MaxStride, TypeByteSize))
+ if (HasSameSize &&
+ isSafeDependenceDistance(
+ DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride))
return Dependence::NoDep;
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
@@ -2151,8 +2149,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectroized/unrolled version. Vectorizing one iteration in
- // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
- // TypeByteSize (No need to plus the last gap distance).
+ // front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
+ // (No need to plus the last gap distance).
//
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
// foo(int *A) {
@@ -2179,8 +2177,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// We know that Dist is positive, but it may not be constant. Use the signed
// minimum for computations below, as this ensures we compute the closest
// possible dependence distance.
- uint64_t MinDistanceNeeded =
- TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
+ uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
if (!ConstDist) {
// For non-constant distances, we checked the lower bound of the
@@ -2236,7 +2233,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
// since there is a backwards dependency.
- uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
+ uint64_t MaxVF = MinDepDistBytes / *CommonStride;
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
<< " with max VF = " << MaxVF << '\n');
>From 5511608174602272e386515b5ceddbfbcf629581 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 20 Feb 2025 13:45:29 +0000
Subject: [PATCH 2/2] LAA: fix nits, comments (NFC)
---
.../llvm/Analysis/LoopAccessAnalysis.h | 15 ++++++-----
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 27 ++++++++++---------
2 files changed, 23 insertions(+), 19 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 9b997f395d186..cb6f47e3a76be 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -367,15 +367,17 @@ class MemoryDepChecker {
struct DepDistanceStrideAndSizeInfo {
const SCEV *Dist;
- /// Strides could either be scaled (in bytes, taking the size of the
- /// underlying type into account), or unscaled (in indexing units; unscaled
- /// stride = scaled stride / size of underlying type). Here, strides are
- /// scaled.
+ /// Strides here are scaled; i.e. in bytes, taking the size of the
+ /// underlying type into account.
uint64_t MaxStride;
std::optional<uint64_t> CommonStride;
bool ShouldRetryWithRuntimeCheck;
+
+ /// TypeByteSize is either the common store size of both accesses, or 0 when
+ /// store sizes mismatch.
uint64_t TypeByteSize;
+
bool AIsWrite;
bool BIsWrite;
@@ -394,8 +396,9 @@ class MemoryDepChecker {
/// there's no dependence or the analysis fails. Outlined to lambda to limit
/// he scope of various temporary variables, like A/BPtr, StrideA/BPtr and
/// others. Returns either the dependence result, if it could already be
- /// determined, or a struct containing (Distance, Stride, TypeSize, AIsWrite,
- /// BIsWrite).
+ /// determined, or a DepDistanceStrideAndSizeInfo struct, noting that
+ /// TypeByteSize could be 0 when store sizes mismatch, and this should be
+ /// checked in the caller.
std::variant<Dependence::DepType, DepDistanceStrideAndSizeInfo>
getDependenceDistanceStrideAndSize(const MemAccessInfo &A, Instruction *AInst,
const MemAccessInfo &B,
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 232e28b78fc29..8267bdc0044a8 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1781,14 +1781,14 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
Status = S;
}
-/// Given a dependence-distance \p Dist between two
-/// memory accesses, that have strides in the same direction whose absolute
-/// value of the maximum stride is given in \p MaxStride, and that have the same
-/// type size \p TypeByteSize, in a loop whose maximum backedge taken count is
-/// \p MaxBTC, check if it is possible to prove statically that the dependence
+/// Given a dependence-distance \p Dist between two memory accesses, that have
+/// strides in the same direction whose absolute value of the maximum stride is
+/// given in \p MaxStride, in a loop whose maximum backedge taken count is \p
+/// MaxBTC, check if it is possible to prove statically that the dependence
/// distance is larger than the range that the accesses will travel through the
/// execution of the loop. If so, return true; false otherwise. This is useful
/// for example in loops such as the following (PR31098):
+///
/// for (i = 0; i < D; ++i) {
/// = out[i];
/// out[i+D] =
@@ -1844,8 +1844,8 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
}
/// Check the dependence for two accesses with the same stride \p Stride.
-/// \p Distance is the positive distance and \p TypeByteSize is type size in
-/// bytes.
+/// \p Distance is the positive distance in bytes, and \p TypeByteSize is type
+/// size in bytes.
///
/// \returns true if they are independent.
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
@@ -1983,13 +1983,14 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
return MemoryDepChecker::Dependence::Unknown;
}
- TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
- BStoreSz = DL.getTypeStoreSize(BTy);
+ TypeSize AStoreSz = DL.getTypeStoreSize(ATy);
+ TypeSize BStoreSz = DL.getTypeStoreSize(BTy);
// If store sizes are not the same, set TypeByteSize to zero, so we can check
- // it in the caller.
- uint64_t ASz = DL.getTypeAllocSize(ATy), BSz = DL.getTypeAllocSize(BTy),
- TypeByteSize = AStoreSz == BStoreSz ? BSz : 0;
+ // it in the caller isDependent.
+ uint64_t ASz = DL.getTypeAllocSize(ATy);
+ uint64_t BSz = DL.getTypeAllocSize(BTy);
+ uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;
uint64_t StrideAScaled = std::abs(StrideAPtrInt) * ASz;
uint64_t StrideBScaled = std::abs(StrideBPtrInt) * BSz;
@@ -2159,7 +2160,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// B[i] = A[i] + 1;
// }
//
- // Two accesses in memory (stride is 2):
+ // Two accesses in memory (stride is 4 * 2):
// | A[0] | | A[2] | | A[4] | | A[6] | |
// | B[0] | | B[2] | | B[4] |
//
More information about the llvm-commits
mailing list