[llvm] e75f586 - [AMDGPU] Relax lds dma waitcnt with no aliasing pair (#131842)

via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 24 10:38:51 PDT 2025


Author: Austin Kerbow
Date: 2025-03-24T10:38:47-07:00
New Revision: e75f586b813a081cffcafb8b5d34b5547e52e548

URL: https://github.com/llvm/llvm-project/commit/e75f586b813a081cffcafb8b5d34b5547e52e548
DIFF: https://github.com/llvm/llvm-project/commit/e75f586b813a081cffcafb8b5d34b5547e52e548.diff

LOG: [AMDGPU] Relax lds dma waitcnt with no aliasing pair (#131842)

If we cannot find any LDS DMA instruction that is aliased by some load
from LDS, we still insert vmcnt(0). This is overly cautious, since
inter-thread dependences are normally handled by the memory model rather
than by the waitcnt pass, so this change updates the behavior to be more
in line with how other types of memory events are handled.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0edacadc4884f..15965f2bac8aa 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1768,7 +1768,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
 
         // LOAD_CNT is only relevant to vgpr or LDS.
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-        bool FoundAliasingStore = false;
         // Only objects with alias scope info were added to LDSDMAScopes array.
         // In the absense of the scope info we will not be able to disambiguate
         // aliasing here. There is no need to try searching for a corresponding
@@ -1778,14 +1777,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
           const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
           for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
-            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
-              FoundAliasingStore = true;
+            if (MI.mayAlias(AA, *LDSDMAStores[I], true))
               ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
-            }
           }
-        }
-        if (!FoundAliasingStore)
+        } else {
           ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
+        }
         if (Memop->isStore()) {
           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
         }

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
index 3cf02be69d3fe..e4e40159e185d 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
@@ -67,7 +67,6 @@ main_body:
 }
 
 ; There are 8 pseudo registers defined to track LDS DMA dependencies.
-; When exhausted we default to vmcnt(0).
 
 ; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
 ; GCN-COUNT-10: buffer_load_dword
@@ -86,7 +85,6 @@ main_body:
 ; GCN: s_waitcnt vmcnt(2)
 ; GCN-NOT: s_waitcnt vmcnt
 ; GCN: ds_read_b32
-; GCN: s_waitcnt vmcnt(0)
 ; GCN: ds_read_b32
 define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
 main_body:
@@ -151,4 +149,29 @@ main_body:
   ret void
 }
 
+define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+; GFX9-LABEL: global_load_lds_no_alias_ds_read:
+; GFX9: global_load_dword
+; GFX9: global_load_dword
+; GFX9: s_waitcnt vmcnt(1)
+; GFX9-NOT: s_waitcnt vmcnt(0)
+; GFX9: ds_read_b32
+; GFX9: s_waitcnt vmcnt(0)
+; GFX9: ds_read_b32
+; GFX9: s_endpgm
+body:
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
+  call void @llvm.amdgcn.s.waitcnt(i32 3953)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.s.waitcnt(i32 3952)
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  %tmp = insertelement <2 x float> poison, float %val.0, i32 0
+  %res = insertelement <2 x float> %tmp, float %val.1, i32 1
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
 declare void @llvm.amdgcn.wave.barrier()


        


More information about the llvm-commits mailing list