[llvm] [AMDGPU] Relax lds dma waitcnt with no aliasing pair (PR #131842)

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 21 20:46:53 PDT 2025


https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/131842

>From cfd1d488e05296fa4651b7bd5a880fb56fd2bdec Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Fri, 21 Mar 2025 19:41:04 -0700
Subject: [PATCH] [AMDGPU] Relax lds dma waitcnt with no aliasing pair

If no LDS DMA instruction is found to alias a load from LDS, we still
insert vmcnt(0). This is overly cautious, since inter-thread dependences
are normally handled by the memory model rather than by the waitcnt
pass. This change updates the behavior to be more in line with how
other types of memory events are handled.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  6 ++---
 llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll   | 27 +++++++++++++++++++--
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index eafa324a04b00..943b22fa830ed 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1757,7 +1757,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
 
         // LOAD_CNT is only relevant to vgpr or LDS.
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-        bool FoundAliasingStore = false;
         // Only objects with alias scope info were added to LDSDMAScopes array.
         // In the absense of the scope info we will not be able to disambiguate
         // aliasing here. There is no need to try searching for a corresponding
@@ -1768,13 +1767,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
           const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
           for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
             if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
-              FoundAliasingStore = true;
               ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
             }
           }
-        }
-        if (!FoundAliasingStore)
+        } else {
           ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
+        }
         if (Memop->isStore()) {
           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
         }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
index 3cf02be69d3fe..e4e40159e185d 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
@@ -67,7 +67,6 @@ main_body:
 }
 
 ; There are 8 pseudo registers defined to track LDS DMA dependencies.
-; When exhausted we default to vmcnt(0).
 
 ; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
 ; GCN-COUNT-10: buffer_load_dword
@@ -86,7 +85,6 @@ main_body:
 ; GCN: s_waitcnt vmcnt(2)
 ; GCN-NOT: s_waitcnt vmcnt
 ; GCN: ds_read_b32
-; GCN: s_waitcnt vmcnt(0)
 ; GCN: ds_read_b32
 define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
 main_body:
@@ -151,4 +149,29 @@ main_body:
   ret void
 }
 
+define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+; GFX9-LABEL: global_load_lds_no_alias_ds_read:
+; GFX9: global_load_dword
+; GFX9: global_load_dword
+; GFX9: s_waitcnt vmcnt(1)
+; GFX9-NOT: s_waitcnt vmcnt(0)
+; GFX9: ds_read_b32
+; GFX9: s_waitcnt vmcnt(0)
+; GFX9: ds_read_b32
+; GFX9: s_endpgm
+body:
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
+  call void @llvm.amdgcn.s.waitcnt(i32 3953)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.s.waitcnt(i32 3952)
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  %tmp = insertelement <2 x float> poison, float %val.0, i32 0
+  %res = insertelement <2 x float> %tmp, float %val.1, i32 1
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
 declare void @llvm.amdgcn.wave.barrier()



More information about the llvm-commits mailing list