[llvm] [AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory legalizer (PR #150887)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 27 23:34:53 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Sameer Sahasrabuddhe (ssahasra)
<details>
<summary>Changes</summary>
The new instruction represents the unknown number of waitcnts needed at a
release operation to ensure that prior direct loads to LDS (formerly called LDS
DMA) are completed. The instruction is replaced in SIInsertWaitcnts with a
suitable value for vmcnt().
Co-authored-by: Austin Kerbow <austin.kerbow@<!-- -->amd.com>.
---
Patch is 39.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150887.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+20)
- (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (+20)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+12)
- (added) llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir (+133)
- (added) llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll (+543)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index dd3f2fe25a239..9a4360374621d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1381,6 +1381,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = &II;
+ } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) {
+ assert(ST->hasVMemToLDSLoad());
+ LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_LDS_DIRECT: " << II
+ << "Before: " << Wait.LoadCnt << '\n';);
+ ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+ LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+
+ // It is possible (but unlikely) that this is the only wait instruction,
+ // in which case, we exit this loop without a WaitcntInstr to consume
+ // `Wait`. But that works because `Wait` was passed in by reference, and
+ // the callee eventually calls createNewWaitcnt on it. We test this
+ // possibility in an articial MIR test since such a situation cannot be
+ // recreated by running the memory legalizer.
+ II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1552,6 +1566,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) {
+ // Architectures higher than GFX10 do not have direct loads to
+ // LDS, so no work required here yet.
+ II.eraseFromParent();
+ continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
@@ -2442,6 +2461,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT ||
counterTypeForInstr(Opcode).has_value();
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0e8a420fbb70a..30c180c0e420e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT));
+ Changed = true;
+ }
+
if (Pos == Position::AFTER)
--MI;
@@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT));
+ Changed = true;
+ }
+
if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e103ccc2f00e6..09630e20840cf 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
+// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
+
+def S_WAITCNT_LDS_DIRECT : SPseudoInstSI<(outs), (ins)> {
+ let hasSideEffects = 0;
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 66037615f0ba0..5fd8553820685 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -545,11 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
@@ -578,12 +580,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -613,12 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -1293,12 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 {
; GFX10WGP-LABEL: name: workgroup_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
@@ -1330,6 +1338,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
@@ -1337,6 +1346,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
@@ -1369,6 +1379,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
@@ -1376,6 +1387,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
new file mode 100644
index 0000000000000..b376360157141
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
@@ -0,0 +1,133 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
+
+
+# Expected vmcnt(0) since the direct load is the only load.
+---
+name: dma_then_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.
+
+---
+name: dma_then_global_load
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_global_load
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3953
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected no vmcnt since there is no direct load to LDS, and the global load is not processed by SIInsertWaitcnts.
+
+---
+name: no_dma_just_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: no_dma_just_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected vmcnt(1) since the global load is not processed by SIInsertWaitcnts.
+
+---
+name: dma_then_system_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_system_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3953
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# The computed vmcnt(1) gets merged with the existing vmcnt(0).
+
+---
+name: merge_with_prev_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: merge_with_prev_wait
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# The computed vmcnt(1) gets merged with the existing vmcnt(0).
+
+---
+name: merge_with_next_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: merge_with_next_wait
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ S_WAITCNT 3952
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
new file mode 100644
index 0000000000000..d23509b5aa812
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -0,0 +1,543 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10WGP
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck %s -check-prefixes=GFX10CU
+
+; In each of these tests, an LDS DMA operation is followed by a release pattern
+; at workgroup scope. The fence in such a release (implicit or explicit) should
+; wait for the store component in the LDS DMA. The additional noalias metadata
+; is just meant to ensure that the wait counts are not generated due to some
+; unintended aliasing.
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
+; GFX900-LABEL: barrier_release:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX900-NEXT: v_mov_b32_e32 v1, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 m0, s12
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX900-NEXT: v_mov_b32_e32 v0, s13
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_barrier
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_store_dword v1, v0, s[14:15]
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: barrier_release:
+; GFX90A: ; %bb.1:
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_branch .LBB0_0
+; GFX90A-NEXT: .p2align 8
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: .LBB0_0: ; %main_body
+; GFX90A-NEXT: s_mov_b32 m0, s12
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX90A-NEXT: v_mov_b32_e32 v0, s13
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_barrier
+; GFX90A-NEXT: ds_read_b32 v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: barrier_release:
+; GFX90A-TGSPLIT: ; %bb.1:
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_branch .LBB0_0
+; GFX90A-TGSPLIT-NEXT: .p2align 8
+; GFX90A-TGSPLIT-NEXT: ; %bb.2:
+; GFX90A-TGSPLIT-NEXT: .LBB0_0: ; %main_body
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_barrier
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-LABEL: barrier_release:
+; GFX942: ; %bb.1:
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_branch .LBB0_0
+; GFX942-NEXT: .p2align 8
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: .LBB0_0: ; %main_body
+; GFX942-NEXT: s_mov_b32 m0, s12
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX942-NEXT: v_mov_b32_e32 v0, s13
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_barrier
+; GFX942-NEXT: ds_read_b32 v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/150887
More information about the llvm-commits
mailing list