[llvm] 8a91b68 - [AMDGPU] Limit memory scope for scratch, LDS and GDS
Tony Tye via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 14 09:35:29 PST 2021
Author: Tony Tye
Date: 2021-02-14T17:34:12Z
New Revision: 8a91b68b95e6d1fd31e2a62a61ecb3a3506cf837
URL: https://github.com/llvm/llvm-project/commit/8a91b68b95e6d1fd31e2a62a61ecb3a3506cf837
DIFF: https://github.com/llvm/llvm-project/commit/8a91b68b95e6d1fd31e2a62a61ecb3a3506cf837.diff
LOG: [AMDGPU] Limit memory scope for scratch, LDS and GDS
Changes for AMD GPU SIMemoryLegalizer:
- Limit the memory scope to the maximum supported by the scratch, LDS and
GDS address spaces.
- Improve assertion checking.
- Correct toSIAtomicScope argument name.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D96643
Added:
Modified:
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 3caa75e4d958..40173bcc477b 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -129,12 +129,43 @@ class SIMemOpInfo final {
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
IsVolatile(IsVolatile),
IsNonTemporal(IsNonTemporal) {
+
+ if (Ordering == AtomicOrdering::NotAtomic) {
+ assert(Scope == SIAtomicScope::NONE &&
+ OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
+ !IsCrossAddressSpaceOrdering &&
+ FailureOrdering == AtomicOrdering::NotAtomic);
+ return;
+ }
+
+ assert(Scope != SIAtomicScope::NONE &&
+ (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
+ SIAtomicAddrSpace::NONE &&
+ (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
+ SIAtomicAddrSpace::NONE &&
+ !isStrongerThan(FailureOrdering, Ordering));
+
// There is also no cross address space ordering if the ordering
// address space is the same as the instruction address space and
// only contains a single address space.
if ((OrderingAddrSpace == InstrAddrSpace) &&
isPowerOf2_32(uint32_t(InstrAddrSpace)))
this->IsCrossAddressSpaceOrdering = false;
+
+ // Limit the scope to the maximum supported by the instruction's address
+ // spaces.
+ if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
+ SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
+ } else if ((InstrAddrSpace &
+ ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
+ SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
+ } else if ((InstrAddrSpace &
+ ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
+ SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
+ this->Scope = std::min(Scope, SIAtomicScope::AGENT);
+ }
}
public:
@@ -202,12 +233,12 @@ class SIMemOpAccess final {
void reportUnsupported(const MachineBasicBlock::iterator &MI,
const char *Msg) const;
- /// Inspects the target synchonization scope \p SSID and determines
+ /// Inspects the target synchronization scope \p SSID and determines
/// the SI atomic scope it corresponds to, the address spaces it
/// covers, and whether the memory ordering applies between address
/// spaces.
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
- toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
+ toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
/// \return Return a bit set of the address spaces accessed by \p AS.
SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
@@ -476,7 +507,7 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
- SIAtomicAddrSpace InstrScope) const {
+ SIAtomicAddrSpace InstrAddrSpace) const {
if (SSID == SyncScope::System)
return std::make_tuple(SIAtomicScope::SYSTEM,
SIAtomicAddrSpace::ATOMIC,
@@ -499,23 +530,23 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
true);
if (SSID == MMI->getSystemOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SYSTEM,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::AGENT,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WORKGROUP,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::WAVEFRONT,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
- SIAtomicAddrSpace::ATOMIC & InstrScope,
+ SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
false);
return None;
}
@@ -591,7 +622,8 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
ScopeOrNone.getValue();
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
- ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
+ ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
+ ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
reportUnsupported(MI, "Unsupported atomic address space");
return None;
}
@@ -659,7 +691,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
}
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
- IsCrossAddressSpaceOrdering);
+ IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
}
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index fe32aa069697..5c18b52b57bd 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -30,17 +30,16 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB0_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -59,18 +58,17 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: s_nop 1
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -88,18 +86,17 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: s_nop 1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -122,7 +119,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -153,7 +149,6 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -192,14 +187,13 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB1_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0
@@ -225,10 +219,9 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -257,10 +250,9 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX9-NEXT: s_mul_i32 s3, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -295,7 +287,6 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -330,7 +321,6 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -356,10 +346,9 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -401,18 +390,18 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB2_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -451,18 +440,18 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -516,7 +505,6 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB2_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -568,7 +556,6 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB2_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -595,10 +582,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -640,18 +626,18 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB3_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -690,18 +676,18 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB3_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -755,7 +741,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB3_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -807,7 +792,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB3_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -834,10 +818,9 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -879,18 +862,18 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -929,18 +912,18 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -994,7 +977,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -1046,7 +1028,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -1084,12 +1065,12 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB5_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0
@@ -1099,7 +1080,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -1119,12 +1099,12 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: v_mov_b32_e32 v1, s2
@@ -1132,8 +1112,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_nop 1
+; GFX8-NEXT: s_nop 2
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1152,12 +1131,12 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
@@ -1165,8 +1144,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_nop 1
+; GFX9-NEXT: s_nop 2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -1190,7 +1168,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB5_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -1223,7 +1200,6 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB5_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
@@ -1266,10 +1242,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB6_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
@@ -1310,10 +1285,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1353,10 +1327,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1401,7 +1374,6 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB6_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -1443,7 +1415,6 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB6_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -1478,10 +1449,9 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1493,10 +1463,9 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1507,10 +1476,9 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1528,7 +1496,6 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@@ -1544,7 +1511,6 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@@ -1573,18 +1539,17 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB8_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -1603,18 +1568,18 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB8_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1632,18 +1597,18 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -1666,7 +1631,6 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB8_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -1697,7 +1661,6 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB8_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -1736,14 +1699,13 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB9_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0
@@ -1769,10 +1731,9 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB9_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1801,10 +1762,9 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX9-NEXT: s_mul_i32 s3, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB9_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1839,7 +1799,6 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
@@ -1874,7 +1833,6 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -1900,10 +1858,9 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1945,18 +1902,18 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB10_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1995,18 +1952,18 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB10_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2060,7 +2017,6 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB10_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -2112,7 +2068,6 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB10_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -2150,12 +2105,12 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB11_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0
@@ -2165,7 +2120,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -2185,12 +2139,12 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0
@@ -2200,7 +2154,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2219,12 +2172,12 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0
@@ -2234,7 +2187,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2258,7 +2210,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB11_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -2293,7 +2244,6 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB11_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
@@ -2338,10 +2288,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB12_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
@@ -2382,10 +2331,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB12_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2425,10 +2373,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB12_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -2473,7 +2420,6 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB12_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -2515,7 +2461,6 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB12_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -2550,10 +2495,9 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2565,10 +2509,9 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2579,10 +2522,9 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2600,7 +2542,6 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
@@ -2616,7 +2557,6 @@ define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
@@ -2635,10 +2575,9 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -2680,18 +2619,18 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB14_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2730,18 +2669,18 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB14_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2795,7 +2734,6 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB14_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -2847,7 +2785,6 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB14_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -2874,10 +2811,9 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -2919,18 +2855,18 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB15_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2969,18 +2905,18 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB15_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3034,7 +2970,6 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB15_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -3086,7 +3021,6 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB15_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -3113,10 +3047,9 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -3158,18 +3091,18 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB16_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3208,18 +3141,18 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB16_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3273,7 +3206,6 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB16_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -3325,7 +3257,6 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB16_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -3352,10 +3283,9 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -3397,18 +3327,18 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB17_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_max_i32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3447,18 +3377,18 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB17_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_max_i32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3514,7 +3444,6 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB17_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -3568,7 +3497,6 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB17_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -3604,12 +3532,12 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB18_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
@@ -3622,7 +3550,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -3640,12 +3567,12 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
@@ -3658,7 +3585,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3675,12 +3601,12 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
@@ -3693,7 +3619,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3715,7 +3640,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB18_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -3749,7 +3673,6 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB18_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
@@ -3779,10 +3702,9 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -3824,18 +3746,18 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB19_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_min_i32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3874,18 +3796,18 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB19_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_min_i32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3941,7 +3863,6 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB19_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -3995,7 +3916,6 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB19_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -4031,12 +3951,12 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB20_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
@@ -4049,7 +3969,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4067,12 +3986,12 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
@@ -4085,7 +4004,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4102,12 +4020,12 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
@@ -4120,7 +4038,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4142,7 +4059,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB20_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -4176,7 +4092,6 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB20_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
@@ -4206,10 +4121,9 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -4251,18 +4165,18 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB21_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_max_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4301,18 +4215,18 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB21_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_max_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4366,7 +4280,6 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB21_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -4418,7 +4331,6 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB21_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -4454,12 +4366,12 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB22_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -4471,7 +4383,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4489,12 +4400,12 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -4506,7 +4417,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4523,12 +4433,12 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -4540,7 +4450,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4562,7 +4471,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -4596,7 +4504,6 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
@@ -4626,10 +4533,9 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -4671,18 +4577,18 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB23_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_min_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4721,18 +4627,18 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB23_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_min_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4786,7 +4692,6 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB23_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
@@ -4838,7 +4743,6 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB23_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
@@ -4874,12 +4778,12 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT: buffer_wbinvl1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
@@ -4891,7 +4795,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4909,12 +4812,12 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -4926,7 +4829,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4943,12 +4845,12 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -4960,7 +4862,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4982,7 +4883,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
-; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -5016,7 +4916,6 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
-; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
index 04eac1a9b485..76cd3378f279 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -13,10 +13,9 @@ define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind {
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_not_b32_e32 v1, v2
; GCN-NEXT: v_or_b32_e32 v1, -5, v1
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_wbinvl1_vol
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 9ba655592859..f8ecd9730504 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -34,7 +34,7 @@ define void @lds_atomic_fadd_noret_f32(float addrspace(3)* %ptr) nounwind {
; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
-; HAS-ATOMICS: s_waitcnt vmcnt(0) lgkmcnt(0)
+; HAS-ATOMICS: s_waitcnt lgkmcnt(0)
; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
%idx.add = add nuw i32 %idx, 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index afb020f5314c..e928b4f54a76 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -147,7 +147,6 @@ define amdgpu_kernel void @local_agent_acquire_load(
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v1, v0
; GFX6-NEXT: s_endpgm
@@ -160,7 +159,6 @@ define amdgpu_kernel void @local_agent_acquire_load(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
@@ -174,7 +172,6 @@ define amdgpu_kernel void @local_agent_acquire_load(
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
; GFX10-WGP-NEXT: s_endpgm
;
@@ -186,8 +183,6 @@ define amdgpu_kernel void @local_agent_acquire_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v1, v0
; GFX10-CU-NEXT: s_endpgm
;
@@ -217,10 +212,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v1, v0
; GFX6-NEXT: s_endpgm
@@ -231,10 +225,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
@@ -250,7 +243,6 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
; GFX10-WGP-NEXT: s_endpgm
;
@@ -260,12 +252,9 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v1, v0
; GFX10-CU-NEXT: s_endpgm
;
@@ -275,7 +264,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
@@ -407,7 +396,7 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -418,7 +407,7 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -439,8 +428,7 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -451,7 +439,7 @@ define amdgpu_kernel void @local_agent_release_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
@@ -469,7 +457,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -480,7 +468,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -501,8 +489,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -513,7 +500,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
@@ -587,8 +574,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_acquire_atomicrmw:
@@ -599,8 +585,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_acquire_atomicrmw:
@@ -613,7 +598,6 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_acquire_atomicrmw:
@@ -624,9 +608,6 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_acquire_atomicrmw:
@@ -637,7 +618,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
@@ -654,7 +635,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -665,7 +646,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -686,8 +667,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -698,7 +678,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
@@ -716,10 +696,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_acq_rel_atomicrmw:
@@ -729,10 +708,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_acq_rel_atomicrmw:
@@ -747,7 +725,6 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_acq_rel_atomicrmw:
@@ -756,13 +733,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_atomicrmw:
@@ -772,9 +745,9 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
@@ -791,10 +764,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_seq_cst_atomicrmw:
@@ -804,10 +776,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_seq_cst_atomicrmw:
@@ -822,7 +793,6 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_seq_cst_atomicrmw:
@@ -831,13 +801,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_atomicrmw:
@@ -847,9 +813,9 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
@@ -867,8 +833,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -880,8 +845,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -895,7 +859,6 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -907,9 +870,6 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -921,7 +881,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
@@ -940,10 +900,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -954,10 +913,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -973,7 +931,6 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -983,13 +940,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1000,9 +953,9 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
@@ -1021,10 +974,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -1035,10 +987,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -1054,7 +1005,6 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1064,13 +1014,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1081,9 +1027,9 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
@@ -1166,8 +1112,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_acquire_monotonic_cmpxchg:
@@ -1179,8 +1124,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg:
@@ -1194,7 +1138,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_acquire_monotonic_cmpxchg:
@@ -1206,9 +1149,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_cmpxchg:
@@ -1221,7 +1161,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1240,7 +1180,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT: s_endpgm
;
@@ -1252,7 +1192,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT: s_endpgm
;
@@ -1275,8 +1215,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -1289,7 +1228,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -1309,10 +1248,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
@@ -1323,10 +1261,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
@@ -1342,7 +1279,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
@@ -1352,13 +1288,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
@@ -1370,9 +1302,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1391,10 +1323,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
@@ -1405,10 +1336,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
@@ -1424,7 +1354,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
@@ -1434,13 +1363,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
@@ -1452,9 +1377,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1474,8 +1399,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_acquire_acquire_cmpxchg:
@@ -1487,8 +1411,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_acquire_acquire_cmpxchg:
@@ -1502,7 +1425,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_acquire_acquire_cmpxchg:
@@ -1514,9 +1436,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_cmpxchg:
@@ -1529,7 +1448,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1548,10 +1467,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_release_acquire_cmpxchg:
@@ -1562,10 +1480,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_release_acquire_cmpxchg:
@@ -1581,7 +1498,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_release_acquire_cmpxchg:
@@ -1591,13 +1507,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_cmpxchg:
@@ -1609,9 +1521,9 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1630,10 +1542,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_acq_rel_acquire_cmpxchg:
@@ -1644,10 +1555,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg:
@@ -1663,7 +1573,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
@@ -1673,13 +1582,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_cmpxchg:
@@ -1691,9 +1596,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1712,10 +1617,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_seq_cst_acquire_cmpxchg:
@@ -1726,10 +1630,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg:
@@ -1745,7 +1648,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
@@ -1755,13 +1657,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_cmpxchg:
@@ -1773,9 +1671,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1794,10 +1692,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
@@ -1808,10 +1705,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
@@ -1827,7 +1723,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
@@ -1837,13 +1732,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
@@ -1855,9 +1746,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1877,8 +1768,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -1891,8 +1781,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -1906,7 +1795,6 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1918,9 +1806,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1934,7 +1820,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -1956,10 +1842,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -1971,10 +1856,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -1990,7 +1874,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2001,12 +1884,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2019,9 +1899,9 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2043,10 +1923,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2058,10 +1937,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2077,7 +1955,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2088,12 +1965,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2106,9 +1980,9 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2131,8 +2005,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2145,8 +2018,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2160,7 +2032,6 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2172,9 +2043,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2188,7 +2057,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2210,10 +2079,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2225,10 +2093,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2244,7 +2111,6 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2255,12 +2121,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2273,9 +2136,9 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2297,10 +2160,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2312,10 +2174,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2331,7 +2192,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2342,12 +2202,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2360,9 +2217,9 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2384,10 +2241,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2399,10 +2255,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2418,7 +2273,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2429,12 +2283,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2447,9 +2298,9 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2471,10 +2322,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2486,10 +2336,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2505,7 +2354,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2516,12 +2364,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2534,9 +2379,9 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 4ef62030ef5c..5a9bb112aa60 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -147,7 +147,6 @@ define amdgpu_kernel void @local_system_acquire_load(
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v1, v0
; GFX6-NEXT: s_endpgm
@@ -160,7 +159,6 @@ define amdgpu_kernel void @local_system_acquire_load(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
@@ -174,7 +172,6 @@ define amdgpu_kernel void @local_system_acquire_load(
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
; GFX10-WGP-NEXT: s_endpgm
;
@@ -186,8 +183,6 @@ define amdgpu_kernel void @local_system_acquire_load(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v1, v0
; GFX10-CU-NEXT: s_endpgm
;
@@ -217,10 +212,9 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v1, v0
; GFX6-NEXT: s_endpgm
@@ -231,10 +225,9 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
@@ -250,7 +243,6 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
; GFX10-WGP-NEXT: s_endpgm
;
@@ -260,12 +252,9 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v1, v0
; GFX10-CU-NEXT: s_endpgm
;
@@ -275,7 +264,7 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
@@ -407,7 +396,7 @@ define amdgpu_kernel void @local_system_release_store(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -418,7 +407,7 @@ define amdgpu_kernel void @local_system_release_store(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -439,8 +428,7 @@ define amdgpu_kernel void @local_system_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -451,7 +439,7 @@ define amdgpu_kernel void @local_system_release_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
@@ -469,7 +457,7 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -480,7 +468,7 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -501,8 +489,7 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -513,7 +500,7 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
@@ -587,8 +574,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_acquire_atomicrmw:
@@ -599,8 +585,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_acquire_atomicrmw:
@@ -613,7 +598,6 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_acquire_atomicrmw:
@@ -624,9 +608,6 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_acquire_atomicrmw:
@@ -637,7 +618,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
@@ -654,7 +635,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -665,7 +646,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -686,8 +667,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -698,7 +678,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
@@ -716,10 +696,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_acq_rel_atomicrmw:
@@ -729,10 +708,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw:
@@ -747,7 +725,6 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_acq_rel_atomicrmw:
@@ -756,13 +733,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_acq_rel_atomicrmw:
@@ -772,9 +745,9 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
@@ -791,10 +764,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_seq_cst_atomicrmw:
@@ -804,10 +776,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw:
@@ -822,7 +793,6 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_seq_cst_atomicrmw:
@@ -831,13 +801,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_seq_cst_atomicrmw:
@@ -847,9 +813,9 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
@@ -867,8 +833,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -880,8 +845,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -895,7 +859,6 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -907,9 +870,6 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -921,7 +881,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
@@ -940,10 +900,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -954,10 +913,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -973,7 +931,6 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -983,13 +940,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1000,9 +953,9 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
@@ -1021,10 +974,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -1035,10 +987,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -1054,7 +1005,6 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1064,13 +1014,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1081,9 +1027,9 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
@@ -1166,8 +1112,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg:
@@ -1179,8 +1124,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg:
@@ -1194,7 +1138,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_acquire_monotonic_cmpxchg:
@@ -1206,9 +1149,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_cmpxchg:
@@ -1221,7 +1161,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1240,7 +1180,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX6-NEXT: s_endpgm
;
@@ -1252,7 +1192,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX7-NEXT: s_endpgm
;
@@ -1275,8 +1215,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_endpgm
;
@@ -1289,7 +1228,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -1309,10 +1248,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg:
@@ -1323,10 +1261,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg:
@@ -1342,7 +1279,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
@@ -1352,13 +1288,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_cmpxchg:
@@ -1370,9 +1302,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1391,10 +1323,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg:
@@ -1405,10 +1336,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg:
@@ -1424,7 +1354,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
@@ -1434,13 +1363,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_cmpxchg:
@@ -1452,9 +1377,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1474,8 +1399,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_acquire_acquire_cmpxchg:
@@ -1487,8 +1411,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg:
@@ -1502,7 +1425,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_acquire_acquire_cmpxchg:
@@ -1514,9 +1436,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_cmpxchg:
@@ -1529,7 +1448,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1548,10 +1467,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_release_acquire_cmpxchg:
@@ -1562,10 +1480,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg:
@@ -1581,7 +1498,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_release_acquire_cmpxchg:
@@ -1591,13 +1507,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_release_acquire_cmpxchg:
@@ -1609,9 +1521,9 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1630,10 +1542,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg:
@@ -1644,10 +1555,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg:
@@ -1663,7 +1573,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
@@ -1673,13 +1582,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_cmpxchg:
@@ -1691,9 +1596,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1712,10 +1617,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg:
@@ -1726,10 +1630,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg:
@@ -1745,7 +1648,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
@@ -1755,13 +1657,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_cmpxchg:
@@ -1773,9 +1671,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1794,10 +1692,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
@@ -1808,10 +1705,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
@@ -1827,7 +1723,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
@@ -1837,13 +1732,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
@@ -1855,9 +1746,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
@@ -1877,8 +1768,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -1891,8 +1781,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -1906,7 +1795,6 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -1918,9 +1806,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1934,7 +1820,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -1956,10 +1842,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -1971,10 +1856,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -1990,7 +1874,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2001,12 +1884,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2019,9 +1899,9 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2043,10 +1923,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2058,10 +1937,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2077,7 +1955,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2088,12 +1965,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2106,9 +1980,9 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2131,8 +2005,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2145,8 +2018,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2160,7 +2032,6 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2172,9 +2043,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2188,7 +2057,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2210,10 +2079,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2225,10 +2093,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2244,7 +2111,6 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2255,12 +2121,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2273,9 +2136,9 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2297,10 +2160,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2312,10 +2174,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2331,7 +2192,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2342,12 +2202,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2360,9 +2217,9 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2384,10 +2241,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2399,10 +2255,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2418,7 +2273,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2429,12 +2283,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2447,9 +2298,9 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
@@ -2471,10 +2322,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
@@ -2486,10 +2336,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
@@ -2505,7 +2354,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: buffer_gl0_inv
-; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
@@ -2516,12 +2364,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-CU-NEXT: buffer_gl0_inv
-; GFX10-CU-NEXT: buffer_gl1_inv
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -2534,9 +2379,9 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
-; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
More information about the llvm-commits
mailing list