[llvm] a34a024 - [AMDGPU][SIInsertWaitCnts] skip meta instructions early (#145720)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 1 09:32:52 PDT 2025
Author: Sameer Sahasrabuddhe
Date: 2025-07-01T22:02:48+05:30
New Revision: a34a02481246482a490c320afb7375ec24cb634e
URL: https://github.com/llvm/llvm-project/commit/a34a02481246482a490c320afb7375ec24cb634e
DIFF: https://github.com/llvm/llvm-project/commit/a34a02481246482a490c320afb7375ec24cb634e.diff
LOG: [AMDGPU][SIInsertWaitCnts] skip meta instructions early (#145720)
When iterating over a block, meta instructions have no effect on wait counts,
but encountering one drops the reference to any earlier waitcnt instructions
before those waitcnt instructions have been processed. This results in spurious
s_waitcnt instructions, which do not affect correctness but are unnecessary in
the resulting program. Skipping meta instructions as soon as they are seen
cleans this up.
Added:
llvm/test/CodeGen/AMDGPU/waitcnt-trailing.mir
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
llvm/test/CodeGen/AMDGPU/extract-subvector.ll
llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 69ea8aa6122aa..6414e81baae70 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1786,8 +1786,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
bool FlushVmCnt) {
setForceEmitWaitcnt();
- if (MI.isMetaInstruction())
- return false;
+ assert(!MI.isMetaInstruction());
AMDGPU::Waitcnt Wait;
@@ -2474,6 +2473,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
E = Block.instr_end();
Iter != E;) {
MachineInstr &Inst = *Iter;
+ if (Inst.isMetaInstruction()) {
+ ++Iter;
+ continue;
+ }
// Track pre-existing waitcnts that were added in earlier iterations or by
// the memory legalizer.
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 9ae7c4aaa1e95..b0439b1f7968f 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -2250,7 +2250,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
; GFX9-SDAG-NEXT: s_mov_b32 s34, s12
; GFX9-SDAG-NEXT: s_mov_b32 s33, s11
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow:
@@ -2317,7 +2316,6 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
; GFX9-GISEL-NEXT: s_mov_b32 s34, s12
; GFX9-GISEL-NEXT: s_mov_b32 s33, s11
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index bb66bb319d481..a07f1d8a02941 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -731,7 +731,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: .LBB3_3: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -973,7 +972,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: .LBB4_3: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -1217,7 +1215,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; GFX9-NEXT: .LBB5_3: ; %exit
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
@@ -1595,7 +1592,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; GFX9-NEXT: s_movk_i32 s34, 0x3800
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GFX9-NEXT: v_cmp_gt_u16_sdwa vcc, v7, s34 src0_sel:WORD_1 src1_sel:DWORD
@@ -1933,7 +1929,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3d00
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 41082821bafe3..a8d94146195f4 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -127,7 +127,6 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -138,7 +137,6 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: .LBB1_4: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
@@ -197,7 +195,6 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -208,7 +205,6 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: .LBB2_4: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
@@ -305,7 +301,6 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
@@ -376,7 +371,6 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -387,7 +381,6 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: .LBB4_4: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
; GCN-NEXT: v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, -2.0, vcc
@@ -446,7 +439,6 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s8, s10
; GCN-NEXT: s_mov_b32 s9, s10
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
@@ -457,7 +449,6 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: .LBB5_4: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0xbff00000
; GCN-NEXT: v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, -2.0, v0, vcc
@@ -554,7 +545,6 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i
; GCN-NEXT: v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
; GCN-NEXT: v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
; GCN-NEXT: v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
; GCN-NEXT: v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
; GCN-NEXT: v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 3d9fff23107b0..2cf76554078a7 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -6171,13 +6171,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; NOOPT-NEXT: v_mov_b32_e32 v11, v14
; NOOPT-NEXT: v_mov_b32_e32 v12, v13
; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:32
-; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
-; NOOPT-NEXT: s_waitcnt expcnt(0)
; NOOPT-NEXT: v_mov_b32_e32 v9, v4
; NOOPT-NEXT: v_mov_b32_e32 v10, v3
; NOOPT-NEXT: v_mov_b32_e32 v11, v2
@@ -7290,7 +7289,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: ; implicit-def: $sgpr0
-; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: ;;#ASMSTART
; NOOPT-NEXT: ; reg use v[0:3]
; NOOPT-NEXT: ;;#ASMEND
@@ -7313,7 +7311,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: ; implicit-def: $sgpr0
-; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: ;;#ASMSTART
; NOOPT-NEXT: ; reg use v[0:3]
; NOOPT-NEXT: ;;#ASMEND
@@ -7534,7 +7531,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: ;;#ASMSTART
; NOOPT-NEXT: ; reg use v[0:3]
; NOOPT-NEXT: ;;#ASMEND
@@ -7558,7 +7554,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: ;;#ASMSTART
; NOOPT-NEXT: ; reg use v[0:3]
; NOOPT-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 8b600c835a160..74a72e04fa4ae 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -7367,7 +7367,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -7918,7 +7917,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -8215,7 +8213,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -8505,7 +8502,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -8777,7 +8773,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -9052,7 +9047,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -9349,7 +9343,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -9646,7 +9639,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -9943,7 +9935,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -10240,7 +10231,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -10533,7 +10523,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -10830,7 +10819,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -11127,7 +11115,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -18653,7 +18640,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -18928,7 +18914,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -19225,7 +19210,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -19515,7 +19499,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -19787,7 +19770,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -20062,7 +20044,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -20359,7 +20340,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -20656,7 +20636,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -20953,7 +20932,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -21250,7 +21228,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -21543,7 +21520,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -21840,7 +21816,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -22137,7 +22112,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 9c11781da56f2..be148464c156e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -6365,7 +6365,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -6642,7 +6641,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -6945,7 +6943,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -7241,7 +7238,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -7515,7 +7511,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -7792,7 +7787,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -8095,7 +8089,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -8398,7 +8391,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -8701,7 +8693,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -9004,7 +8995,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -9303,7 +9293,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -9606,7 +9595,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -9909,7 +9897,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -17555,7 +17542,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -18112,7 +18098,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -18415,7 +18400,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -18711,7 +18695,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -18985,7 +18968,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -19262,7 +19244,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -19565,7 +19546,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -19868,7 +19848,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -20171,7 +20150,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -20474,7 +20452,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -20773,7 +20750,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -21076,7 +21052,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
@@ -21379,7 +21354,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
index 34260c49ff92c..944951d3a536a 100644
--- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -31,7 +31,6 @@ define amdgpu_kernel void @simple_barrier(ptr addrspace(1) %arg) {
; GCN-NEXT: s_load_dword s2, s[0:1], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_barrier
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; wave barrier
; GCN-NEXT: s_load_dword s3, s[0:1], 0x4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index e7343c150ea70..b1e304e79348e 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -773,7 +773,6 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; kill: killed $vgpr0_vgpr1
; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
-; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
; GCN-NEXT: global_store_dword v3, v0, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-trailing.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-trailing.mir
new file mode 100644
index 0000000000000..90faebd6967bd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-trailing.mir
@@ -0,0 +1,20 @@
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s
+
+# Check that a trivial soft waitcnt at the end of a block is deleted even if it
+# is followed by a meta instruction.
+
+# CHECK-LABEL: name: waitcnt-no-redundant
+# CHECK: S_WAITCNT 0
+# CHECK: S_MOV_B32
+# CHECK-NOT: S_WAITCNT
+
+---
+name: waitcnt-no-redundant
+body: |
+ bb.1:
+ S_WAITCNT_soft 53119
+ $sgpr2 = S_MOV_B32 42
+ S_WAITCNT_soft 53119
+ $vgpr2 = IMPLICIT_DEF
+
+...
More information about the llvm-commits mailing list