[llvm] [AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory legalizer (PR #150887)
Sameer Sahasrabuddhe via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 27 23:34:24 PDT 2025
https://github.com/ssahasra created https://github.com/llvm/llvm-project/pull/150887
The new instruction represents the unknown number of waitcnts needed at a
release operation to ensure that prior direct loads to LDS (formerly called LDS
DMA) are completed. The instruction is replaced in SIInsertWaitcnts with a
suitable value for vmcnt().
Co-authored-by: Austin Kerbow <austin.kerbow at amd.com>.
>From 2efe0cd67fc0e7f2d035d7913cbf858493036fb0 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Tue, 17 Jun 2025 13:11:55 +0530
Subject: [PATCH 1/2] [AMDGCN] pre-checkin test for LDS DMA and release
operations
---
.../AMDGPU/lds-dma-workgroup-release.ll | 541 ++++++++++++++++++
1 file changed, 541 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
new file mode 100644
index 0000000000000..98e42a2c4c402
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -0,0 +1,541 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10WGP
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck %s -check-prefixes=GFX10CU
+
+; In each of these tests, an LDS DMA operation is followed by a release pattern
+; at workgroup scope. The fence in such a release (implicit or explicit) should
+; wait for the store component in the LDS DMA. The additional noalias metadata
+; is just meant to ensure that the wait counts are not generated due to some
+; unintended aliasing.
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+
+define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
+; GFX900-LABEL: barrier_release:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX900-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX900-NEXT: v_mov_b32_e32 v1, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 m0, s12
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX900-NEXT: v_mov_b32_e32 v0, s13
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_barrier
+; GFX900-NEXT: ds_read_b32 v0, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_store_dword v1, v0, s[14:15]
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: barrier_release:
+; GFX90A: ; %bb.1:
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_branch .LBB0_0
+; GFX90A-NEXT: .p2align 8
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: .LBB0_0: ; %main_body
+; GFX90A-NEXT: s_mov_b32 m0, s12
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX90A-NEXT: v_mov_b32_e32 v0, s13
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_barrier
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ds_read_b32 v0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: barrier_release:
+; GFX90A-TGSPLIT: ; %bb.1:
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_branch .LBB0_0
+; GFX90A-TGSPLIT-NEXT: .p2align 8
+; GFX90A-TGSPLIT-NEXT: ; %bb.2:
+; GFX90A-TGSPLIT-NEXT: .LBB0_0: ; %main_body
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_barrier
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-LABEL: barrier_release:
+; GFX942: ; %bb.1:
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_branch .LBB0_0
+; GFX942-NEXT: .p2align 8
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: .LBB0_0: ; %main_body
+; GFX942-NEXT: s_mov_b32 m0, s12
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX942-NEXT: v_mov_b32_e32 v0, s13
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_barrier
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: ds_read_b32 v0, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: barrier_release:
+; GFX942-TGSPLIT: ; %bb.1:
+; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_branch .LBB0_0
+; GFX942-TGSPLIT-NEXT: .p2align 8
+; GFX942-TGSPLIT-NEXT: ; %bb.2:
+; GFX942-TGSPLIT-NEXT: .LBB0_0: ; %main_body
+; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX942-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_barrier
+; GFX942-TGSPLIT-NEXT: buffer_inv sc0
+; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX10WGP-LABEL: barrier_release:
+; GFX10WGP: ; %bb.0: ; %main_body
+; GFX10WGP-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10WGP-NEXT: s_mov_b32 m0, s12
+; GFX10WGP-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX10WGP-NEXT: v_mov_b32_e32 v0, s13
+; GFX10WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10WGP-NEXT: s_barrier
+; GFX10WGP-NEXT: buffer_gl0_inv
+; GFX10WGP-NEXT: ds_read_b32 v0, v0
+; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10WGP-NEXT: global_store_dword v1, v0, s[14:15]
+; GFX10WGP-NEXT: s_endpgm
+;
+; GFX10CU-LABEL: barrier_release:
+; GFX10CU: ; %bb.0: ; %main_body
+; GFX10CU-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX10CU-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CU-NEXT: s_mov_b32 m0, s12
+; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
+; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
+; GFX10CU-NEXT: s_barrier
+; GFX10CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10CU-NEXT: ds_read_b32 v0, v0
+; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CU-NEXT: global_store_dword v1, v0, s[14:15]
+; GFX10CU-NEXT: s_endpgm
+ ptr addrspace(3) inreg %lds1,
+ ptr addrspace(3) inreg %lds2,
+ ptr addrspace(1) %dummy2) {
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102
+ fence syncscope("workgroup") release
+ tail call void @llvm.amdgcn.s.barrier()
+ fence syncscope("workgroup") acquire
+ %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105
+ store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105
+ ret void
+}
+
+define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
+; GFX900-LABEL: fence_fence:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 m0, s6
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
+; GFX900-NEXT: v_mov_b32_e32 v1, 1
+; GFX900-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX900-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s7
+; GFX900-NEXT: ds_read_b32 v1, v1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: fence_fence:
+; GFX90A: ; %bb.1:
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_branch .LBB1_0
+; GFX90A-NEXT: .p2align 8
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: .LBB1_0: ; %main_body
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX90A-NEXT: s_mov_b32 m0, s12
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s13
+; GFX90A-NEXT: ds_read_b32 v1, v1
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: fence_fence:
+; GFX90A-TGSPLIT: ; %bb.1:
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_branch .LBB1_0
+; GFX90A-TGSPLIT-NEXT: .p2align 8
+; GFX90A-TGSPLIT-NEXT: ; %bb.2:
+; GFX90A-TGSPLIT-NEXT: .LBB1_0: ; %main_body
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-LABEL: fence_fence:
+; GFX942: ; %bb.1:
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_branch .LBB1_0
+; GFX942-NEXT: .p2align 8
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: .LBB1_0: ; %main_body
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX942-NEXT: s_mov_b32 m0, s12
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
+; GFX942-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0
+; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, s13
+; GFX942-NEXT: ds_read_b32 v1, v1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: fence_fence:
+; GFX942-TGSPLIT: ; %bb.1:
+; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_branch .LBB1_0
+; GFX942-TGSPLIT-NEXT: .p2align 8
+; GFX942-TGSPLIT-NEXT: ; %bb.2:
+; GFX942-TGSPLIT-NEXT: .LBB1_0: ; %main_body
+; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13
+; GFX942-TGSPLIT-NEXT: buffer_inv sc0
+; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX10WGP-LABEL: fence_fence:
+; GFX10WGP: ; %bb.0: ; %main_body
+; GFX10WGP-NEXT: s_clause 0x2
+; GFX10WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10WGP-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c
+; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX10WGP-NEXT: v_mov_b32_e32 v2, 1
+; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10WGP-NEXT: s_mov_b32 m0, s6
+; GFX10WGP-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
+; GFX10WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10WGP-NEXT: global_store_dword v1, v2, s[8:9]
+; GFX10WGP-NEXT: global_load_dword v0, v1, s[8:9] glc
+; GFX10WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10WGP-NEXT: v_mov_b32_e32 v0, s7
+; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10WGP-NEXT: buffer_gl0_inv
+; GFX10WGP-NEXT: ds_read_b32 v0, v0
+; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10WGP-NEXT: global_store_dword v1, v0, s[10:11]
+; GFX10WGP-NEXT: s_endpgm
+;
+; GFX10CU-LABEL: fence_fence:
+; GFX10CU: ; %bb.0: ; %main_body
+; GFX10CU-NEXT: s_clause 0x2
+; GFX10CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10CU-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c
+; GFX10CU-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX10CU-NEXT: v_mov_b32_e32 v2, 1
+; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CU-NEXT: s_mov_b32 m0, s6
+; GFX10CU-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
+; GFX10CU-NEXT: global_store_dword v1, v2, s[8:9]
+; GFX10CU-NEXT: global_load_dword v0, v1, s[8:9]
+; GFX10CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10CU-NEXT: v_mov_b32_e32 v0, s7
+; GFX10CU-NEXT: ds_read_b32 v0, v0
+; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CU-NEXT: global_store_dword v1, v0, s[10:11]
+; GFX10CU-NEXT: s_endpgm
+ ptr addrspace(3) inreg %lds1,
+ ptr addrspace(3) inreg %lds2,
+ ptr addrspace(1) %flag,
+ ptr addrspace(1) %dummy2) {
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102
+ fence syncscope("workgroup") release
+ store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105
+ %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105
+ fence syncscope("workgroup") acquire
+ %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105
+ store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105
+ ret void
+}
+
+define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
+; GFX900-LABEL: release_acquire:
+; GFX900: ; %bb.0: ; %main_body
+; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 m0, s6
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
+; GFX900-NEXT: v_mov_b32_e32 v1, 1
+; GFX900-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX900-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v1, s7
+; GFX900-NEXT: ds_read_b32 v1, v1
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX900-NEXT: s_endpgm
+;
+; GFX90A-LABEL: release_acquire:
+; GFX90A: ; %bb.1:
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_branch .LBB2_0
+; GFX90A-NEXT: .p2align 8
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: .LBB2_0: ; %main_body
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX90A-NEXT: s_mov_b32 m0, s12
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s13
+; GFX90A-NEXT: ds_read_b32 v1, v1
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: release_acquire:
+; GFX90A-TGSPLIT: ; %bb.1:
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: s_branch .LBB2_0
+; GFX90A-TGSPLIT-NEXT: .p2align 8
+; GFX90A-TGSPLIT-NEXT: ; %bb.2:
+; GFX90A-TGSPLIT-NEXT: .LBB2_0: ; %main_body
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13
+; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-LABEL: release_acquire:
+; GFX942: ; %bb.1:
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_branch .LBB2_0
+; GFX942-NEXT: .p2align 8
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: .LBB2_0: ; %main_body
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX942-NEXT: s_mov_b32 m0, s12
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
+; GFX942-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0
+; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, s13
+; GFX942-NEXT: ds_read_b32 v1, v1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: release_acquire:
+; GFX942-TGSPLIT: ; %bb.1:
+; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_branch .LBB2_0
+; GFX942-TGSPLIT-NEXT: .p2align 8
+; GFX942-TGSPLIT-NEXT: ; %bb.2:
+; GFX942-TGSPLIT-NEXT: .LBB2_0: ; %main_body
+; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c
+; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: buffer_inv sc0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13
+; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX10WGP-LABEL: release_acquire:
+; GFX10WGP: ; %bb.0: ; %main_body
+; GFX10WGP-NEXT: s_clause 0x2
+; GFX10WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10WGP-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c
+; GFX10WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX10WGP-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX10WGP-NEXT: v_mov_b32_e32 v2, 1
+; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10WGP-NEXT: s_mov_b32 m0, s6
+; GFX10WGP-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
+; GFX10WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10WGP-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX10WGP-NEXT: global_load_dword v1, v0, s[8:9] glc
+; GFX10WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10WGP-NEXT: buffer_gl0_inv
+; GFX10WGP-NEXT: v_mov_b32_e32 v1, s7
+; GFX10WGP-NEXT: ds_read_b32 v1, v1
+; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10WGP-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX10WGP-NEXT: s_endpgm
+;
+; GFX10CU-LABEL: release_acquire:
+; GFX10CU: ; %bb.0: ; %main_body
+; GFX10CU-NEXT: s_clause 0x2
+; GFX10CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX10CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10CU-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c
+; GFX10CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX10CU-NEXT: v_mov_b32_e32 v1, 0x800
+; GFX10CU-NEXT: v_mov_b32_e32 v2, 1
+; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CU-NEXT: s_mov_b32 m0, s6
+; GFX10CU-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
+; GFX10CU-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX10CU-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10CU-NEXT: v_mov_b32_e32 v1, s7
+; GFX10CU-NEXT: ds_read_b32 v1, v1
+; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10CU-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX10CU-NEXT: s_endpgm
+ ptr addrspace(3) inreg %lds1,
+ ptr addrspace(3) inreg %lds2,
+ ptr addrspace(1) %flag,
+ ptr addrspace(1) %dummy2) {
+main_body:
+ call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102
+ store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") release, align 4, !noalias !105
+ %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") acquire, align 4, !noalias !105
+ %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105
+ store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105
+ ret void
+}
+
+!100 = !{!100}
+!101 = !{!101, !100}
+!102 = !{!101}
+!103 = !{!103, !100}
+!104 = !{!103}
+!105 = !{!101, !103}
>From a997d3282b52ed3f2820c1570c4c979efa4ed3bb Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Fri, 25 Jul 2025 20:16:21 +0530
Subject: [PATCH 2/2] [AMDGPU] introduce S_WAITCNT_LDS_DIRECT in the memory
legalizer
The new instruction represents the unknown number of waitcnts needed at a
release operation to ensure that prior direct loads to LDS (formerly called LDS
DMA) are completed. The instruction is replaced in SIInsertWaitcnts with a
suitable value for vmcnt().
Co-authored-by: Austin Kerbow <austin.kerbow at amd.com>.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 20 +++
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 20 +++
llvm/lib/Target/AMDGPU/SOPInstructions.td | 7 +
.../memory-legalizer-atomic-fence.ll | 12 ++
.../AMDGPU/insert-waitcnts-fence-soft.mir | 133 ++++++++++++++++++
.../AMDGPU/lds-dma-workgroup-release.ll | 20 +--
6 files changed, 203 insertions(+), 9 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index dd3f2fe25a239..9a4360374621d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1381,6 +1381,20 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
Modified = true;
} else
WaitcntInstr = &II;
+ } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) {
+ assert(ST->hasVMemToLDSLoad());
+ LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_LDS_DIRECT: " << II
+ << "Before: " << Wait.LoadCnt << '\n';);
+ ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
+ LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
+
+ // It is possible (but unlikely) that this is the only wait instruction,
+ // in which case, we exit this loop without a WaitcntInstr to consume
+ // `Wait`. But that works because `Wait` was passed in by reference, and
+ // the caller eventually calls createNewWaitcnt on it. We test this
+ // possibility in an artificial MIR test since such a situation cannot be
+ // recreated by running the memory legalizer.
+ II.eraseFromParent();
} else {
assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
@@ -1552,6 +1566,11 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
ScoreBrackets.simplifyWaitcnt(OldWait);
Wait = Wait.combined(OldWait);
UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT) {
+ // Architectures higher than GFX10 do not have direct loads to
+ // LDS, so no work required here yet.
+ II.eraseFromParent();
+ continue;
} else {
std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
assert(CT.has_value());
@@ -2442,6 +2461,7 @@ static bool isWaitInstr(MachineInstr &Inst) {
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_LDS_DIRECT ||
counterTypeForInstr(Opcode).has_value();
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0e8a420fbb70a..30c180c0e420e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1170,6 +1170,16 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT));
+ Changed = true;
+ }
+
if (Pos == Position::AFTER)
--MI;
@@ -2078,6 +2088,16 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
Changed = true;
}
+ // On architectures that support direct loads to LDS, emit an unknown waitcnt
+ // at workgroup-scoped release operations that specify the LDS address space.
+ // SIInsertWaitcnts will later replace this with a vmcnt().
+ if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
+ Scope == SIAtomicScope::WORKGROUP &&
+ any((SIAtomicAddrSpace)AddrSpace & SIAtomicAddrSpace::LDS)) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_LDS_DIRECT));
+ Changed = true;
+ }
+
if (VSCnt) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e103ccc2f00e6..09630e20840cf 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1621,6 +1621,13 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
+// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
+
+def S_WAITCNT_LDS_DIRECT : SPseudoInstSI<(outs), (ins)> {
+ let hasSideEffects = 0;
+}
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 66037615f0ba0..5fd8553820685 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -545,11 +545,13 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_release
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_release
@@ -578,12 +580,14 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_acq_rel
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -613,12 +617,14 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 16240
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_one_as_seq_cst
; GFX10CU: bb.0.entry:
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -1293,12 +1299,14 @@ define amdgpu_kernel void @workgroup_release() #0 {
; GFX10WGP-LABEL: name: workgroup_release
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: S_ENDPGM 0
;
; GFX10CU-LABEL: name: workgroup_release
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_release
@@ -1330,6 +1338,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX10WGP-LABEL: name: workgroup_acq_rel
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
@@ -1337,6 +1346,7 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX10CU-LABEL: name: workgroup_acq_rel
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_acq_rel
@@ -1369,6 +1379,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX10WGP-LABEL: name: workgroup_seq_cst
; GFX10WGP: bb.0.entry:
; GFX10WGP-NEXT: S_WAITCNT_soft 112
+ ; GFX10WGP-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10WGP-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0
; GFX10WGP-NEXT: BUFFER_GL0_INV implicit $exec
; GFX10WGP-NEXT: S_ENDPGM 0
@@ -1376,6 +1387,7 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX10CU-LABEL: name: workgroup_seq_cst
; GFX10CU: bb.0.entry:
; GFX10CU-NEXT: S_WAITCNT_soft 49279
+ ; GFX10CU-NEXT: S_WAITCNT_LDS_DIRECT
; GFX10CU-NEXT: S_ENDPGM 0
;
; GFX11WGP-LABEL: name: workgroup_seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
new file mode 100644
index 0000000000000..b376360157141
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-fence-soft.mir
@@ -0,0 +1,133 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GCN %s
+
+
+# Expected vmcnt(0) since the direct load is the only load.
+---
+name: dma_then_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected vmcnt(1): only the earlier direct load to LDS must complete, so the later global load may remain outstanding.
+
+---
+name: dma_then_global_load
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_global_load
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3953
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected no vmcnt wait: there is no direct load to LDS, and S_WAITCNT_LDS_DIRECT does not wait for the ordinary global load.
+
+---
+name: no_dma_just_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: no_dma_just_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Expected vmcnt(1): only the earlier direct load to LDS must complete, so the later global load may remain outstanding.
+
+---
+name: dma_then_system_fence
+body: |
+ bb.0:
+ ; GCN-LABEL: name: dma_then_system_fence
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3953
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# The computed vmcnt(1) gets merged with the preceding explicit S_WAITCNT vmcnt(0), leaving a single vmcnt(0) wait.
+
+---
+name: merge_with_prev_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: merge_with_prev_wait
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ S_WAITCNT_LDS_DIRECT
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# The computed vmcnt(1) gets merged with the following explicit S_WAITCNT vmcnt(0), leaving a single vmcnt(0) wait.
+
+---
+name: merge_with_next_wait
+body: |
+ bb.0:
+ ; GCN-LABEL: name: merge_with_next_wait
+ ; GCN: S_WAITCNT 0
+ ; GCN-NEXT: $m0 = S_MOV_B32 0
+ ; GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4, addrspace 1), (store (s32) into `ptr addrspace(3) poison` + 4, addrspace 3)
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 3952
+ ; GCN-NEXT: $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec
+ S_WAITCNT_LDS_DIRECT
+ S_WAITCNT 3952
+ $vgpr1 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index 98e42a2c4c402..d23509b5aa812 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -47,9 +47,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX90A-NEXT: v_mov_b32_e32 v0, s13
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_barrier
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ds_read_b32 v0, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -93,9 +92,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX942-NEXT: v_mov_b32_e32 v0, s13
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_barrier
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: ds_read_b32 v0, v0
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
@@ -151,8 +149,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: s_mov_b32 m0, s12
; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
-; GFX10CU-NEXT: s_barrier
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10CU-NEXT: s_barrier
; GFX10CU-NEXT: ds_read_b32 v0, v0
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10CU-NEXT: global_store_dword v1, v0, s[14:15]
@@ -183,6 +181,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
; GFX900-NEXT: s_nop 0
; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
; GFX900-NEXT: v_mov_b32_e32 v1, 1
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v0, v1, s[8:9]
; GFX900-NEXT: global_load_dword v1, v0, s[8:9]
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -207,7 +206,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
; GFX90A-NEXT: v_mov_b32_e32 v1, 1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -258,7 +257,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
; GFX942-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0
; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -330,6 +329,7 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10CU-NEXT: s_mov_b32 m0, s6
; GFX10CU-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
+; GFX10CU-NEXT: s_waitcnt vmcnt(0)
; GFX10CU-NEXT: global_store_dword v1, v2, s[8:9]
; GFX10CU-NEXT: global_load_dword v0, v1, s[8:9]
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
@@ -366,6 +366,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
; GFX900-NEXT: s_nop 0
; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
; GFX900-NEXT: v_mov_b32_e32 v1, 1
+; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v0, v1, s[8:9]
; GFX900-NEXT: global_load_dword v1, v0, s[8:9]
; GFX900-NEXT: s_waitcnt vmcnt(0)
@@ -390,7 +391,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
; GFX90A-NEXT: v_mov_b32_e32 v1, 1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -441,7 +442,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
; GFX942-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0
; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -512,6 +513,7 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10CU-NEXT: s_mov_b32 m0, s6
; GFX10CU-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
+; GFX10CU-NEXT: s_waitcnt vmcnt(0)
; GFX10CU-NEXT: global_store_dword v0, v2, s[8:9]
; GFX10CU-NEXT: global_load_dword v1, v0, s[8:9]
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
More information about the llvm-commits
mailing list