[llvm] [AMDGPU] Remove scope check in SIInsertWaitcnts::generateWaitcntInstBefore (PR #157821)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 03:22:01 PDT 2025
https://github.com/choikwa updated https://github.com/llvm/llvm-project/pull/157821
>From d36ff252b1dc2b8590e610214c0f241474f150b8 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 10 Sep 2025 01:00:15 -0500
Subject: [PATCH 1/2] [AMDGPU] Remove scope check in
SIInsertWaitcnts::generateWaitcntInstBefore
This change was motivated by CK, where many VMCNT(0)s were generated
because instructions lacked !alias.scope metadata. There were two causes:
1) LowerLDSModule not attaching scope metadata to a single LDS variable.
2) The IPSCCP pass, running before the inliner, replacing a noalias ptr
derivative with a global value, which made the inliner unable to track it
back to the noalias ptr argument.
However, it turns out that IPSCCP losing the scope information was largely
inconsequential, as ScopedNoAliasAA is able to handle the asymmetric case,
where one MemLoc is missing a scope, and still return a NoAlias result.
AMDGPU, however, was checking for the existence of a scope in
SIInsertWaitcnts and, when the scope was missing, conservatively treating
the access as aliasing everything and inserting VMCNT(0) before DS_READs,
forcing them to wait for all previous LDS DMA instructions.
Since we know that ScopedNoAliasAA can handle this asymmetry, we should also
let the AA query determine whether two MIs may alias.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 10 ++-
llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll | 65 ++++++++++++++++++++
2 files changed, 69 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index e3a2efdd3856f..68e7bdc2b5ca8 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1943,12 +1943,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// LOAD_CNT is only relevant to vgpr or LDS.
unsigned RegNo = FIRST_LDS_VGPR;
// Only objects with alias scope info were added to LDSDMAScopes array.
- // In the absense of the scope info we will not be able to disambiguate
- // aliasing here. There is no need to try searching for a corresponding
- // store slot. This is conservatively correct because in that case we
- // will produce a wait using the first (general) LDS DMA wait slot which
- // will wait on all of them anyway.
- if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
+ // AliasAnalysis query can determine aliasing even if Memop's Scope is
+ // missing. ScopedNoAlias allows for alias query on MemLoc without a
+ // scope.
+ if (Ptr && Memop->getAAInfo()) {
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
if (MI.mayAlias(AA, *LDSDMAStores[I], true))
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
new file mode 100644
index 0000000000000..6d24de85a8ad8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
@@ -0,0 +1,65 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=CHECK %s
+
+declare void @llvm.amdgcn.sched.barrier(i32 %mask) #0
+declare void @llvm.amdgcn.load.to.lds(ptr %in, ptr addrspace(3) %lds_out, i32 %size, i32 %offset, i32 %aux) #0
+
+define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr addrspace(3) %lds_buffer1, ptr addrspace(3) %lds_buffer2) {
+; This test checks if SIInsertWaitcnts pass inserts S_WAITCNT VMCNT(0) before DS_READ
+; CHECK-NOT: s_waitcnt vmcnt(0)
+; CHECK: ds_read_b32
+entry:
+ ; VMEM accesses with alias.scope
+ %vmem_load = load i32, ptr addrspace(1) %global_buffer
+ %gepvmem = getelementptr i32, ptr addrspace(1) %global_buffer, i32 16
+ store i32 %vmem_load, ptr addrspace(1) %gepvmem, align 4, !alias.scope !0
+
+ ; Global to LDS load
+ %gepvmem.ascast = addrspacecast ptr addrspace(1) %gepvmem to ptr
+ call void @llvm.amdgcn.load.to.lds(ptr %gepvmem.ascast, ptr addrspace(3) %lds_buffer1, i32 4, i32 4, i32 0), !alias.scope !9, !noalias !14
+
+ ; Insert scheduling barrier
+ call void @llvm.amdgcn.sched.barrier(i32 0)
+
+ ; DS_WRITEs with alias.scope and noalias
+ store i32 %vmem_load, ptr addrspace(3) %lds_buffer1, align 4, !alias.scope !1, !noalias !12
+ store i32 %vmem_load, ptr addrspace(3) %lds_buffer2, align 4, !alias.scope !6, !noalias !13
+
+ ; Insert scheduling barrier
+ call void @llvm.amdgcn.sched.barrier(i32 0)
+
+ ; DS_READ with alias.scope missing
+ %lds_load = load i32, ptr addrspace(3) %lds_buffer1, align 4, !noalias !12
+
+ ; VMEM write
+ %gep = getelementptr i32, ptr addrspace(1) %global_buffer, i32 4
+ %gep2 = getelementptr i32, ptr addrspace(1) %global_buffer, i32 8
+ store i32 %lds_load, ptr addrspace(1) %gep, align 4, !alias.scope !0
+ store i32 %vmem_load, ptr addrspace(1) %gep2, align 4, !alias.scope !0
+
+ ret void
+}
+
+; VMEM alias domain and scope
+!5 = !{!"vmem.domain"}
+!4 = !{!"vmem.scope", !5}
+!0 = !{!4}
+
+; LDS alias domains and scopes
+!3 = !{!"lds1.domain"}
+!2 = !{!"lds1.scope", !3}
+!1 = !{!2}
+
+!8 = !{!"lds2.domain"}
+!7 = !{!"lds2.scope", !8}
+!6 = !{!7}
+
+!11 = !{!"lds1_off4.domain"}
+!10 = !{!"lds1_off4.scope", !11}
+!9 = !{!10}
+
+; Noalias lists
+!12 = !{!7, !10}
+!13 = !{!2, !10}
+!14 = !{!2, !7}
+
+attributes #0 = { nounwind }
>From 1855ede257f64300e1d940d6915efb9fa2630fd9 Mon Sep 17 00:00:00 2001
From: Kevin Choi <kevin.choi at amd.com>
Date: Wed, 10 Sep 2025 05:20:33 -0500
Subject: [PATCH 2/2] Remove confusing comments, redundant check-prefix, move
function attribute
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 ----
llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll | 8 ++++----
2 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 68e7bdc2b5ca8..3c9f25765192a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1942,10 +1942,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// LOAD_CNT is only relevant to vgpr or LDS.
unsigned RegNo = FIRST_LDS_VGPR;
- // Only objects with alias scope info were added to LDSDMAScopes array.
- // AliasAnalysis query can determine aliasing even if Memop's Scope is
- // missing. ScopedNoAlias allows for alias query on MemLoc without a
- // scope.
if (Ptr && Memop->getAAInfo()) {
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
index 6d24de85a8ad8..e24c9500f30de 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-unscoped.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx950 < %s | FileCheck %s
-declare void @llvm.amdgcn.sched.barrier(i32 %mask) #0
-declare void @llvm.amdgcn.load.to.lds(ptr %in, ptr addrspace(3) %lds_out, i32 %size, i32 %offset, i32 %aux) #0
+declare void @llvm.amdgcn.sched.barrier(i32 %mask)
+declare void @llvm.amdgcn.load.to.lds(ptr %in, ptr addrspace(3) %lds_out, i32 %size, i32 %offset, i32 %aux)
-define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr addrspace(3) %lds_buffer1, ptr addrspace(3) %lds_buffer2) {
+define amdgpu_kernel void @test_waitcnt(ptr addrspace(1) %global_buffer, ptr addrspace(3) %lds_buffer1, ptr addrspace(3) %lds_buffer2) #0 {
; This test checks if SIInsertWaitcnts pass inserts S_WAITCNT VMCNT(0) before DS_READ
; CHECK-NOT: s_waitcnt vmcnt(0)
; CHECK: ds_read_b32
More information about the llvm-commits
mailing list