[llvm] LAA: be more precise on different store sizes (PR #122318)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 05:29:13 PST 2025
https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/122318
>From 2103c895366e5fddac821179e25bf4f9c3451da7 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 9 Jan 2025 16:50:04 +0000
Subject: [PATCH] LAA: be more precise on different store sizes
The HasSameSize checks, which are triggered on different store sizes, in
MemDepChecker::isDependent are ad-hoc and imprecise, leading to spurious
dependencies and runtime-checks. Identify that the exact scenario in
which to bail out is unequal store sizes when dependence distance is
zero, and check precisely this condition in
MemDepChecker::getDependenceDistanceAndSize, eliminating all the ad-hoc
checks in isDependent and making LoopAccessAnalysis more precise.
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 53 +++++++++----------
.../LoopAccessAnalysis/depend_diff_types.ll | 10 +---
.../forward-loop-carried.ll | 4 --
...interleave-allocsize-not-equal-typesize.ll | 41 +++++++++++---
4 files changed, 60 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a68979add666d..f624abf75f5210 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1990,11 +1990,24 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
return MemoryDepChecker::Dependence::Unknown;
}
- uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
- bool HasSameSize =
- DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
- if (!HasSameSize)
- TypeByteSize = 0;
+ // When the distance is zero, we're reading/writing the same memory location:
+ // check that the store sizes are equal. Otherwise, fail with an unknown
+ // dependence for which we should not generate runtime checks.
+ TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
+ BStoreSz = DL.getTypeStoreSize(BTy);
+ if (Dist->isZero() && AStoreSz != BStoreSz) {
+ LLVM_DEBUG(
+ dbgs() << "LAA: zero dependence distance but different type sizes\n");
+ return Dependence::Unknown;
+ }
+
+ // Bail early on scalabale store sizes.
+ if (AStoreSz.isScalable() || BStoreSz.isScalable())
+ return Dependence::Unknown;
+
+ // The TypeByteSize is used to scale Distance and VF. In these contexts, the
+ // only size that matters is the size of the Sink.
+ uint64_t TypeByteSize = alignTo(BStoreSz, DL.getABITypeAlign(BTy).value());
StrideAPtrInt = std::abs(StrideAPtrInt);
StrideBPtrInt = std::abs(StrideBPtrInt);
@@ -2030,7 +2043,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck,
TypeByteSize, AIsWrite, BIsWrite] =
std::get<DepDistanceStrideAndSizeInfo>(Res);
- bool HasSameSize = TypeByteSize > 0;
if (isa<SCEVCouldNotCompute>(Dist)) {
// TODO: Relax requirement that there is a common unscaled stride to retry
@@ -2048,9 +2060,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// upper bound of the number of iterations), the accesses are independet, i.e.
// they are far enough appart that accesses won't access the same location
// across all loop ierations.
- if (HasSameSize && isSafeDependenceDistance(
- DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
- *Dist, MaxStride, TypeByteSize))
+ if (isSafeDependenceDistance(DL, SE,
+ *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist,
+ MaxStride, TypeByteSize))
return Dependence::NoDep;
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
@@ -2061,7 +2073,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// If the distance between accesses and their strides are known constants,
// check whether the accesses interlace each other.
- if (Distance > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
+ if (Distance > 0 && CommonStride && CommonStride > 1 &&
areStridedAccessesIndependent(Distance, *CommonStride, TypeByteSize)) {
LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
return Dependence::NoDep;
@@ -2075,15 +2087,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// Negative distances are not plausible dependencies.
if (SE.isKnownNonPositive(Dist)) {
- if (SE.isKnownNonNegative(Dist)) {
- if (HasSameSize) {
- // Write to the same location with the same size.
- return Dependence::Forward;
- }
- LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
- "different type sizes\n");
- return Dependence::Unknown;
- }
+ if (SE.isKnownNonNegative(Dist))
+ // Write to the same location with the same size.
+ return Dependence::Forward;
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
// Check if the first access writes to a location that is read in a later
@@ -2103,8 +2109,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
return Dependence::Unknown;
}
- if (!HasSameSize ||
- couldPreventStoreLoadForward(
+ if (couldPreventStoreLoadForward(
ConstDist->getAPInt().abs().getZExtValue(), TypeByteSize)) {
LLVM_DEBUG(
dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
@@ -2135,12 +2140,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
}
- if (!HasSameSize) {
- LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
- "different type sizes\n");
- return Dependence::Unknown;
- }
-
if (!CommonStride)
return Dependence::Unknown;
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index e855578e794fa9..1bf80489b1ea06 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -129,16 +129,8 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence'
; CHECK-NEXT: loop:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Unknown:
-; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
-; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
-; CHECK-EMPTY:
-; CHECK-NEXT: Unknown:
-; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
-; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
-; CHECK-EMPTY:
; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
index adfd19923e921c..7837c20f003e24 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
@@ -70,10 +70,6 @@ define void @forward_different_access_sizes(ptr readnone %end, ptr %start) {
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
; CHECK-NEXT: %l = load i24, ptr %gep.1, align 1
; CHECK-EMPTY:
-; CHECK-NEXT: Forward:
-; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
-; CHECK-NEXT: store i24 %l, ptr %ptr.iv, align 1
-; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
index bd77f9779b680d..6451aa96b04da4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
@@ -35,10 +35,10 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) {
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 1
-; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP13:%.*]] = load i24, ptr [[TMP9]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i24, ptr [[TMP10]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i24, ptr [[TMP11]], align 4, !alias.scope [[META0]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i24, ptr [[TMP12]], align 4, !alias.scope [[META0]]
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i24> poison, i24 [[TMP13]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i24> [[TMP17]], i24 [[TMP14]], i32 1
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i24> [[TMP18]], i24 [[TMP15]], i32 2
@@ -47,7 +47,7 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) {
; CHECK-NEXT: [[TMP22:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
-; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP24]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <4 x i32> [[TMP22]], ptr [[TMP24]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -96,17 +96,42 @@ exit:
define void @pr58722_store_interleave_group(ptr %src, ptr %dst) {
; CHECK-LABEL: @pr58722_store_interleave_group(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP1]]
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[TMP2]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP0]] to i24
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP1]] to i24
+; CHECK-NEXT: store i24 [[TMP6]], ptr [[TMP4]], align 4
+; CHECK-NEXT: store i24 [[TMP7]], ptr [[TMP5]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 5000
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i32 [[IV]]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[IV]]
; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_IV]], align 4
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[GEP_IV]], i64 1
; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i32 [[IV]] to i24
; CHECK-NEXT: store i24 [[TRUNC_IV]], ptr [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 10000
-; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
More information about the llvm-commits
mailing list