[llvm] [AMDGPU] Optimize LDS DMA soft waitcnt (PR #138802)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 6 22:11:14 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Austin Kerbow (kerbowa)
<details>
<summary>Changes</summary>
This patch adds support for optimizing `S_WAITCNT_VMCNT_LDS_DMA_soft`
pseudo instructions: their vmcnt wait is relaxed, allowing the pseudo to be
removed, when no LDS DMA operations are outstanding.
These optimizations are a precursor to a dependent patch in which the
memory legalizer will actually emit these waitcnt pseudos. Adding the
waitcnt to the memory model first, without any optimization, would incur
too severe a performance penalty.
---
Full diff: https://github.com/llvm/llvm-project/pull/138802.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+12)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+1)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+1)
- (modified) llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir (+170)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6f5083acd738d..3e0cf5f35318b 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1278,6 +1278,18 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
if (Opcode == AMDGPU::S_WAITCNT) {
unsigned IEnc = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+
+ // These pseudo waitcnt instructions are only needed to synchronize DS operations
+ // with direct LDS loads that use vmcnt. We can safely relax them when no
+ // outstanding direct LDS loads exist, even if other vmcnt events are pending.
+ if (II.getOpcode() == AMDGPU::S_WAITCNT_VMCNT_LDS_DMA_soft) {
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ AMDGPU::Waitcnt LDSDMAWait;
+ ScoreBrackets.determineWait(LOAD_CNT, RegNo, LDSDMAWait);
+ if (LDSDMAWait.LoadCnt == ~0u)
+ OldWait.LoadCnt = ~0u;
+ }
+
if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 4b97f58ce92b9..8f32c5223266f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1010,6 +1010,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::S_WAITCNT_soft:
+ case AMDGPU:: S_WAITCNT_VMCNT_LDS_DMA_soft:
return AMDGPU::S_WAITCNT;
case AMDGPU::S_WAITCNT_VSCNT_soft:
return AMDGPU::S_WAITCNT_VSCNT;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 3d3f1ba3f5170..a6a080a3574db 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1608,6 +1608,7 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+def S_WAITCNT_VMCNT_LDS_DMA_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
index 21372c06d3223..71e7a9f29689d 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
@@ -117,3 +117,173 @@ body: |
S_ENDPGM 0
...
+
+# Soft waitcnt should be honored here.
+# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: S_BARRIER
+---
+name: buffer_load_dword_lds_ds_read_soft_wait
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3952
+ S_BARRIER
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# No need for waitcnt.
+# GCN-LABEL: name: buffer_store_lds_dword_ds_read_soft_wait
+# GCN: BUFFER_STORE_LDS_DWORD
+# GCN-NEXT: S_BARRIER
+---
+name: buffer_store_lds_dword_ds_read_soft_wait
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(3) poison` + 4), (store (s32) into `ptr addrspace(1) poison` + 4)
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3952
+ S_BARRIER
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# Soft waitcnt should mean vmcnt(1) before the barrier and vmcnt(0) after.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 3953
+# vmcnt(1)
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: series_of_buffer_load_dword_lds_ds_read_soft_wait
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_BARRIER
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# No waitcnt before the barrier because counter is too high
+# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait_redundant
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: buffer_load_dword_lds_ds_read_soft_wait_redundant
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_BARRIER
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# Combine waitcnt.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 3953
+# vmcnt(1)
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_BARRIER
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# Merge waitcnt.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 3953
+# vmcnt(1)
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3954
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_BARRIER
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3952
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3952
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+
+# Handle the preexisting waitcnt.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 0
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_BARRIER
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ S_WAITCNT_VMCNT_LDS_DMA_soft 3953
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/138802
More information about the llvm-commits mailing list