[llvm] [SLP] Invariant loads cannot have a memory dependency on stores. (PR #167929)
David Stuttard via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 08:02:49 PST 2025
https://github.com/dstutt updated https://github.com/llvm/llvm-project/pull/167929
>From 513c98602fe4cbed34eddb5d2cbfbf242ad52e24 Mon Sep 17 00:00:00 2001
From: Michael Bedy <Michael.Bedy at amd.com>
Date: Thu, 13 Nov 2025 13:49:14 -0500
Subject: [PATCH 1/3] [SLP] Invariant loads cannot have a memory dependency on
stores.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 19 ++++-
.../AMDGPU/invariant-load-no-alias-store.ll | 82 +++++++++++++++++++
2 files changed, 100 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc53b0dd3577e..e67e607ee1632 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21623,6 +21623,17 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
}
}
+ // Helper to detect loads marked with !invariant.load metadata. Such loads
+ // are defined to read from memory that never changes for the lifetime of
+ // the program; any store to the same location would be UB. Therefore we
+ // can safely treat an invariant load and any store as non-aliasing
+ // for scheduling/dep purposes and skip creating a dependency edge.
+ auto IsInvariantLoad = [](const Instruction *I) {
+ if (const auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+ return false;
+ };
+
// Handle the memory dependencies (if any).
ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
if (!NextLoadStore)
@@ -21636,10 +21647,15 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
unsigned DistToSrc = 1;
bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
+ if (IsInvariantLoad(SrcInst))
+ return; // An invariant load cannot have memory dependencies.
+
for (ScheduleData *DepDest = NextLoadStore; DepDest;
DepDest = DepDest->getNextLoadStore()) {
assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
+ Instruction *DestInst = DepDest->getInst();
+
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
// SLP->isAliased (which is the expensive part in this loop).
@@ -21648,7 +21664,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
// It's important for the loop break condition (see below) to
// check this limit even between two read-only instructions.
if (DistToSrc >= MaxMemDepDistance ||
- ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
+ (!IsInvariantLoad(DestInst) && // Cannot have memory deps.
+ (SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
(IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
new file mode 100644
index 0000000000000..00bd0feb8c0fa
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes="function(slp-vectorizer)" -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 %s -S | FileCheck %s
+
+define void @test(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0
+; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0
+; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP2:%.*]] = fadd reassoc <2 x half> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[P0]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p0 = getelementptr half, ptr addrspace(1) %base, i32 0
+ %p1 = getelementptr half, ptr addrspace(1) %base, i32 1
+ ; First pair of invariant loads from otherA and otherB.
+ %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0
+ %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0
+ %A0 = load half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0
+ %B0 = load half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0
+ %add0 = fadd reassoc half %A0, %B0
+ store half %add0, ptr addrspace(1) %p0, align 2
+ %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1
+ %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1
+ %A1 = load half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0
+ %B1 = load half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0
+ %add1 = fadd reassoc half %A1, %B1
+ store half %add1, ptr addrspace(1) %p1, align 2
+ ret void
+}
+
+
+define void @aliastest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 {
+; CHECK-LABEL: define void @aliastest(
+; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1
+; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0
+; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0
+; CHECK-NEXT: [[A0:%.*]] = load half, ptr addrspace(1) [[A0PTR]], align 2
+; CHECK-NEXT: [[B0:%.*]] = load half, ptr addrspace(1) [[B0PTR]], align 2
+; CHECK-NEXT: [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]]
+; CHECK-NEXT: store half [[ADD0]], ptr addrspace(1) [[P0]], align 2
+; CHECK-NEXT: [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1
+; CHECK-NEXT: [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1
+; CHECK-NEXT: [[A1:%.*]] = load half, ptr addrspace(1) [[A1PTR]], align 2
+; CHECK-NEXT: [[B1:%.*]] = load half, ptr addrspace(1) [[B1PTR]], align 2
+; CHECK-NEXT: [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]]
+; CHECK-NEXT: store half [[ADD1]], ptr addrspace(1) [[P1]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p0 = getelementptr half, ptr addrspace(1) %base, i32 0
+ %p1 = getelementptr half, ptr addrspace(1) %base, i32 1
+ ; First pair of plain (non-invariant) loads from otherA and otherB.
+ %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0
+ %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0
+ %A0 = load half, ptr addrspace(1) %A0PTR, align 2
+ %B0 = load half, ptr addrspace(1) %B0PTR, align 2
+ %add0 = fadd reassoc half %A0, %B0
+ store half %add0, ptr addrspace(1) %p0, align 2
+ %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1
+ %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1
+ %A1 = load half, ptr addrspace(1) %A1PTR, align 2
+ %B1 = load half, ptr addrspace(1) %B1PTR, align 2
+ %add1 = fadd reassoc half %A1, %B1
+ store half %add1, ptr addrspace(1) %p1, align 2
+ ret void
+}
+
+
+attributes #0 = { nounwind }
+
+!0 = !{}
+;.
+; CHECK: [[META0]] = !{}
+;.
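
For reference, a minimal IR sketch (hypothetical, not part of the patch) of
the semantics patch 1 relies on: !invariant.load asserts that the loaded
location holds the same value for the lifetime of the program, so any store
that aliased it would be UB, and no dependency edge is ever needed between
the two.

define half @sketch(ptr addrspace(1) %inv, ptr addrspace(1) %out) {
  ; This load may be reordered freely past the store: if %inv and %out
  ; aliased, the store would modify invariant memory, which is UB.
  %v = load half, ptr addrspace(1) %inv, align 2, !invariant.load !0
  store half 0xH0000, ptr addrspace(1) %out, align 2
  ret half %v
}
!0 = !{}
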
>From 4d014b8e1e3f004248eaefbd50353a4147bcd4cb Mon Sep 17 00:00:00 2001
From: Michael Bedy <Michael.Bedy at amd.com>
Date: Thu, 13 Nov 2025 22:10:06 -0500
Subject: [PATCH 2/3] Update location; exclude simple invariant loads from
 the memory inst scheduler data list. Add volatile test.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 30 ++++++--------
.../AMDGPU/invariant-load-no-alias-store.ll | 39 +++++++++++++++++++
2 files changed, 51 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e67e607ee1632..8cf5f9e133b6e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21412,7 +21412,18 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
"new ScheduleData already in scheduling region");
SD->init(SchedulingRegionID, I);
+ auto CanIgnoreLoad = [](const Instruction *I) {
+ const LoadInst *LI = dyn_cast<LoadInst>(I);
+ // A simple load marked as invariant can be ignored here.
+ // But in the (unlikely) case of a non-simple invariant load,
+ // we must not ignore it.
+ return LI && LI->isSimple() &&
+ LI->getMetadata(LLVMContext::MD_invariant_load);
+ };
+
if (I->mayReadOrWriteMemory() &&
+ // A simple invariant load does not depend on other memory accesses.
+ !CanIgnoreLoad(I) &&
(!isa<IntrinsicInst>(I) ||
(cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
cast<IntrinsicInst>(I)->getIntrinsicID() !=
@@ -21623,17 +21634,6 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
}
}
- // Helper to detect loads marked with !invariant.load metadata. Such loads
- // are defined to read from memory that never changes for the lifetime of
- // the program; any store to the same location would be UB. Therefore we
- // can safely treat an invariant load and any store as non-aliasing
- // for scheduling/dep purposes and skip creating a dependency edge.
- auto IsInvariantLoad = [](const Instruction *I) {
- if (const auto *LI = dyn_cast<LoadInst>(I))
- return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
- return false;
- };
-
// Handle the memory dependencies (if any).
ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
if (!NextLoadStore)
@@ -21647,15 +21647,10 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
unsigned DistToSrc = 1;
bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(SrcInst);
- if (IsInvariantLoad(SrcInst))
- return; // An invariant load cannot have memory dependencies.
-
for (ScheduleData *DepDest = NextLoadStore; DepDest;
DepDest = DepDest->getNextLoadStore()) {
assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
- Instruction *DestInst = DepDest->getInst();
-
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
// SLP->isAliased (which is the expensive part in this loop).
@@ -21664,8 +21659,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
// It's important for the loop break condition (see below) to
// check this limit even between two read-only instructions.
if (DistToSrc >= MaxMemDepDistance ||
- (!IsInvariantLoad(DestInst) && // Cannot have memory deps.
- (SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
+ ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
(IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
index 00bd0feb8c0fa..87537c05573ae 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/invariant-load-no-alias-store.ll
@@ -73,6 +73,45 @@ entry:
ret void
}
+define void @voltest(ptr addrspace(1) %base, ptr addrspace(1) %otherA, ptr addrspace(1) %otherB) #0 {
+; CHECK-LABEL: define void @voltest(
+; CHECK-SAME: ptr addrspace(1) [[BASE:%.*]], ptr addrspace(1) [[OTHERA:%.*]], ptr addrspace(1) [[OTHERB:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[P0:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr addrspace(1) [[BASE]], i32 1
+; CHECK-NEXT: [[A0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 0
+; CHECK-NEXT: [[B0PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 0
+; CHECK-NEXT: [[A0:%.*]] = load volatile half, ptr addrspace(1) [[A0PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT: [[B0:%.*]] = load volatile half, ptr addrspace(1) [[B0PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT: [[ADD0:%.*]] = fadd reassoc half [[A0]], [[B0]]
+; CHECK-NEXT: store half [[ADD0]], ptr addrspace(1) [[P0]], align 2
+; CHECK-NEXT: [[A1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERA]], i32 1
+; CHECK-NEXT: [[B1PTR:%.*]] = getelementptr half, ptr addrspace(1) [[OTHERB]], i32 1
+; CHECK-NEXT: [[A1:%.*]] = load volatile half, ptr addrspace(1) [[A1PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT: [[B1:%.*]] = load volatile half, ptr addrspace(1) [[B1PTR]], align 2, !invariant.load [[META0]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd reassoc half [[A1]], [[B1]]
+; CHECK-NEXT: store half [[ADD1]], ptr addrspace(1) [[P1]], align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %p0 = getelementptr half, ptr addrspace(1) %base, i32 0
+ %p1 = getelementptr half, ptr addrspace(1) %base, i32 1
+ ; First pair of volatile invariant loads from otherA and otherB.
+ %A0PTR = getelementptr half, ptr addrspace(1) %otherA, i32 0
+ %B0PTR = getelementptr half, ptr addrspace(1) %otherB, i32 0
+ %A0 = load volatile half, ptr addrspace(1) %A0PTR, align 2, !invariant.load !0
+ %B0 = load volatile half, ptr addrspace(1) %B0PTR, align 2, !invariant.load !0
+ %add0 = fadd reassoc half %A0, %B0
+ store half %add0, ptr addrspace(1) %p0, align 2
+ %A1PTR = getelementptr half, ptr addrspace(1) %otherA, i32 1
+ %B1PTR = getelementptr half, ptr addrspace(1) %otherB, i32 1
+ %A1 = load volatile half, ptr addrspace(1) %A1PTR, align 2, !invariant.load !0
+ %B1 = load volatile half, ptr addrspace(1) %B1PTR, align 2, !invariant.load !0
+ %add1 = fadd reassoc half %A1, %B1
+ store half %add1, ptr addrspace(1) %p1, align 2
+ ret void
+}
+
attributes #0 = { nounwind }
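
As a hypothetical illustration (not part of the patch) of why the revised
check also requires isSimple(): of the two loads below, only the first may
be dropped from the scheduler's memory-instruction list. The second is
non-simple, and its volatile ordering must be preserved even though it
carries !invariant.load.

define void @guard(ptr addrspace(1) %p) {
  %a = load half, ptr addrspace(1) %p, align 2, !invariant.load !0
  ; Non-simple: must stay in the memory-instruction list despite the
  ; !invariant.load metadata.
  %b = load volatile half, ptr addrspace(1) %p, align 2, !invariant.load !0
  ret void
}
!0 = !{}
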
>From d6fdd3d2f09812d3d2146b400da7bf6eb5804d96 Mon Sep 17 00:00:00 2001
From: Michael Bedy <Michael.Bedy at amd.com>
Date: Fri, 14 Nov 2025 14:38:50 -0500
Subject: [PATCH 3/3] Fix nit.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8cf5f9e133b6e..8231f5234eac2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21413,7 +21413,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
SD->init(SchedulingRegionID, I);
auto CanIgnoreLoad = [](const Instruction *I) {
- const LoadInst *LI = dyn_cast<LoadInst>(I);
+ const auto *LI = dyn_cast<LoadInst>(I);
// A simple load marked as invariant can be ignored here.
// But in the (unlikely) case of a non-simple invariant load,
// we must not ignore it.