[llvm] [AMDGPU] Optimize LDS DMA soft waitcnt (PR #138802)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Thu May 29 12:28:40 PDT 2025
https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/138802
>From 551d49cdb677b09d44f1b253de87de32fefa94d3 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Tue, 6 May 2025 22:07:30 -0700
Subject: [PATCH] [AMDGPU] Optimize LDS DMA soft waitcnt
This patch adds support for optimizing `S_WAITCNT_DIRECT_LDS_LOAD_soft`
pseudo instructions by analyzing whether they can be relaxed, or removed
entirely in the absence of outstanding LDS DMA (direct LDS load) operations.
These optimizations are a precursor to a dependent patch where these
waitcnt pseudos will actually be emitted by the memory legalizer. Adding
the waitcnt to the memory model first, without any optimization, would
incur too severe a performance penalty.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 17 ++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 +
llvm/lib/Target/AMDGPU/SOPInstructions.td | 6 +
llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir | 170 +++++++++++++++++++
4 files changed, 194 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ca8e3244edd15..674715e60de25 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1278,6 +1278,23 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
if (Opcode == AMDGPU::S_WAITCNT) {
unsigned IEnc = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+
+ // These pseudo waitcnt instructions are only needed to synchronize DS
+ // operations with direct LDS loads that use vmcnt. We can safely relax
+ // them when no outstanding direct LDS loads exist, even if other vmcnt
+ // events are pending.
+ if (II.getOpcode() == AMDGPU::S_WAITCNT_DIRECT_LDS_LOAD_soft &&
+ TrySimplify) {
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ AMDGPU::Waitcnt LDSDirectWait;
+ ScoreBrackets.determineWait(LOAD_CNT, RegNo, LDSDirectWait);
+ // Relax waitcnt to only wait on inflight direct LDS loads.
+ if (LDSDirectWait.LoadCnt > OldWait.LoadCnt) {
+ OldWait.LoadCnt = LDSDirectWait.LoadCnt;
+ Modified = true;
+ }
+ }
+
if (TrySimplify)
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 64ab064a75f44..1465fed5315e8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1010,6 +1010,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::S_WAITCNT_soft:
+ case AMDGPU::S_WAITCNT_DIRECT_LDS_LOAD_soft:
return AMDGPU::S_WAITCNT;
case AMDGPU::S_WAITCNT_VSCNT_soft:
return AMDGPU::S_WAITCNT_VSCNT;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 40b3dfb94ce2f..46a641704bfeb 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1608,6 +1608,12 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+// Soft waitcnt for direct loads to LDS from global memory. These waits may be
+// relaxed or removed entirely based on current in-flight memory operations
+// and their relation to these direct LDS loads. For example, if global loads
+// to LDS are mixed with global loads not writing to LDS, a wait may only be
+// necessary for the LDS-writing loads to synchronize with other LDS operations.
+def S_WAITCNT_DIRECT_LDS_LOAD_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
index 21372c06d3223..6c2854b67b134 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
@@ -117,3 +117,173 @@ body: |
S_ENDPGM 0
...
+
+# Soft waitcnt should be honored here.
+# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: S_BARRIER
+---
+name: buffer_load_dword_lds_ds_read_soft_wait
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
+ S_BARRIER
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# No need for waitcnt.
+# GCN-LABEL: name: buffer_store_lds_dword_ds_read_soft_wait
+# GCN: BUFFER_STORE_LDS_DWORD
+# GCN-NEXT: S_BARRIER
+---
+name: buffer_store_lds_dword_ds_read_soft_wait
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(3) poison` + 4), (store (s32) into `ptr addrspace(1) poison` + 4)
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
+ S_BARRIER
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# Soft waitcnt should mean vmcnt(1) before the barrier and vmcnt(0) after.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 3953
+# vmcnt(1)
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: series_of_buffer_load_dword_lds_ds_read_soft_wait
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_BARRIER
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# No waitcnt before the barrier: the requested vmcnt(1) is already satisfied with only one load in flight.
+# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait_redundant
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: buffer_load_dword_lds_ds_read_soft_wait_redundant
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_BARRIER
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# Combine waitcnt.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 3953
+# vmcnt(1)
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_BARRIER
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+# Merge waitcnt.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 3953
+# vmcnt(1)
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3954
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_BARRIER
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
+
+
+# Handle the preexisting waitcnt.
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting
+# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 0
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_BARRIER
+# GCN-NEXT: S_WAITCNT 3952
+# vmcnt(0)
+# GCN-NEXT: DS_READ_B32_gfx9
+---
+name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_BARRIER
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ S_WAITCNT_DIRECT_LDS_LOAD_soft 3953
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ S_ENDPGM 0
+
+...
More information about the llvm-commits
mailing list