[llvm] 3cf539f - [AMDGPU] Combine or remove redundant waitcnts at the end of each MBB (#87539)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 4 02:14:19 PDT 2024
Author: Jay Foad
Date: 2024-04-04T10:14:16+01:00
New Revision: 3cf539fb046457a444e93cefc87cca10cbd3b807
URL: https://github.com/llvm/llvm-project/commit/3cf539fb046457a444e93cefc87cca10cbd3b807
DIFF: https://github.com/llvm/llvm-project/commit/3cf539fb046457a444e93cefc87cca10cbd3b807.diff
LOG: [AMDGPU] Combine or remove redundant waitcnts at the end of each MBB (#87539)
Call generateWaitcnt unconditionally at the end of
SIInsertWaitcnts::insertWaitcntInBlock. Even if we don't need to
generate a new waitcnt instruction it has the effect of combining or
removing redundant waitcnts that were already present. Tests show
various small improvements in waitcnt placement.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
llvm/test/CodeGen/AMDGPU/function-args.ll
llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 27621906e4c5fd..bb499c5c8c578e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -708,9 +708,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
bool FlushVmCnt);
- bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr);
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
@@ -1902,31 +1899,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
OldWaitcntInstr);
}
-// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
-// end of the given block if needed.
-bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
- WaitcntBrackets &ScoreBrackets,
- MachineInstr *OldWaitcntInstr) {
- AMDGPU::Waitcnt Wait;
-
- unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
- unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
- unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
-
- if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
- return false;
-
- if (LoadCntPending != 0)
- Wait.LoadCnt = 0;
- if (SampleCntPending != 0)
- Wait.SampleCnt = 0;
- if (BvhCntPending != 0)
- Wait.BvhCnt = 0;
-
- return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
- OldWaitcntInstr);
-}
-
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
MachineBasicBlock::instr_iterator It,
MachineBasicBlock &Block,
@@ -2355,9 +2327,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
++Iter;
}
+ // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
+ // needed.
+ AMDGPU::Waitcnt Wait;
if (Block.getFirstTerminator() == Block.end() &&
- isPreheaderToFlush(Block, ScoreBrackets))
- Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+ isPreheaderToFlush(Block, ScoreBrackets)) {
+ if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ Wait.LoadCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ Wait.SampleCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ Wait.BvhCnt = 0;
+ }
+
+ // Combine or remove any redundant waitcnts at the end of the block.
+ Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+ OldWaitcntInstr);
return Modified;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index c25b0f21282663..78d908455e019b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -16,7 +16,6 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB0_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -44,7 +43,6 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB1_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -74,7 +72,6 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB2_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = trunc i32 %value to i1
@@ -106,7 +103,6 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB3_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%value = load i32, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
index 303dc46e2c8845..5c22d5bdcf7449 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
index 63702d2587574b..e005c38355a3ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 352adacbda3e4b..af6f69130910d0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -65,11 +65,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -92,11 +92,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -253,8 +253,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -504,11 +504,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -544,11 +544,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -944,7 +944,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -952,6 +951,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -974,7 +974,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1006,7 +1005,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1219,11 +1217,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
@@ -1258,11 +1256,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
@@ -1530,10 +1528,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -1557,12 +1555,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB7_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1585,12 +1583,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -1751,8 +1749,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -2006,11 +2004,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB9_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2046,11 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2446,7 +2444,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -2454,6 +2451,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -2477,7 +2475,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2487,6 +2484,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2509,7 +2507,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2519,6 +2516,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3081,11 +3079,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB14_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3121,11 +3119,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB14_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3355,11 +3353,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB15_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3395,11 +3393,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3629,11 +3627,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB16_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3669,11 +3667,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3903,11 +3901,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB17_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3943,11 +3941,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB17_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4151,7 +4149,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
@@ -4162,6 +4159,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4182,7 +4180,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4216,7 +4213,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4419,11 +4415,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB19_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4459,11 +4455,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4667,7 +4663,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
@@ -4678,6 +4673,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4698,7 +4694,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4732,7 +4727,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4935,11 +4929,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB21_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4975,11 +4969,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB21_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5183,7 +5177,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -5193,6 +5186,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5214,7 +5208,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -5226,6 +5219,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5246,7 +5240,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5258,6 +5251,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5446,11 +5440,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB23_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5486,11 +5480,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB23_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5694,7 +5688,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5704,6 +5697,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5725,7 +5719,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5737,6 +5730,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5757,7 +5751,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5769,6 +5762,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 19a1d2d9dbd1ef..c9076a9541b237 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -186,7 +186,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index ac50fb86c96f7e..da609bfa8edea6 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add
; GCN-NEXT: .LBB0_2: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 069c57e2ae63e2..6dabd8c0b83eae 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -103,7 +103,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -131,7 +130,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB0_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -266,7 +264,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -294,7 +291,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB1_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -431,7 +427,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: .LBB2_4: ; %exit
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
@@ -461,7 +456,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB2_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -665,7 +659,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB3_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -871,7 +864,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB4_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1081,7 +1073,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB5_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -1432,7 +1423,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB7_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
@@ -1724,7 +1714,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB8_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index db89ad66ffab0d..3b2f15c8340a63 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -114,7 +114,6 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: .LBB3_2: ; %bb2
; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_arg_i1_use:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 433a836e7ca030..3b3e107a62967c 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -33,7 +33,7 @@ define void @func_use_lds_global() {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -103,7 +103,7 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_trap 2
@@ -171,7 +171,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2
; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -181,7 +181,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4
; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -189,7 +189,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: .LBB2_4: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -379,7 +379,7 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: ds_write_b32 v0, v2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -472,7 +472,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -481,7 +481,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: .LBB4_2: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
@@ -506,7 +505,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: .LBB4_2: ; %ret
; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: func_uses_lds_phi_after:
@@ -527,7 +526,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB4_2: ; %ret
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: func_uses_lds_phi_after:
@@ -548,7 +547,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: .LBB4_2: ; %ret
; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-LABEL: func_uses_lds_phi_after:
@@ -570,7 +569,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: .LBB4_3: ; %ret
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
; SDAG-NEXT: .LBB4_4:
; SDAG-NEXT: s_endpgm
@@ -594,7 +593,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: .LBB4_3: ; %ret
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GISEL-NEXT: .LBB4_4:
; GISEL-NEXT: s_endpgm
@@ -616,6 +615,3 @@ ret:
; CHECK: {{.*}}
; GFX8: {{.*}}
; GFX9: {{.*}}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 5e76dfd9bdddbd..4477f028c6d2f2 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -157,7 +157,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: .LBB2_2:
; VI-NEXT: s_or_b64 exec, exec, s[6:7]
; VI-NEXT: s_mov_b64 s[6:7], exec
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s8, v1
; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -203,15 +202,14 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: ; %bb.7:
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_add_rtn_f32 v2, v2, v1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: .LBB2_8:
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s2, v2
; VI-NEXT: v_add_f32_e32 v2, s2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -240,7 +238,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: .LBB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -285,16 +282,15 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: s_cbranch_execz .LBB2_8
; GFX9-NEXT: ; %bb.7:
; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_8:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_add_f32_e32 v0, s2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 138dd53b3ede41..d19ef75cb08cd5 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1260,8 +1260,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB11_5: ; %end
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB11_6:
; GFX11-NEXT: s_mov_b64 exec, 0
@@ -1525,8 +1523,6 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB13_6:
; GFX11-NEXT: s_mov_b64 exec, 0
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index eef5f57beb07d7..ecebbb9ac874f8 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -32,7 +32,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
entry:
@@ -79,7 +79,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a,
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.5:
entry:
More information about the llvm-commits
mailing list