[llvm] 9ad8e12 - [AMDGPU] Expand scratch atomics to flat atomics if GAS is enabled (#154710)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 1 01:17:44 PDT 2025
Author: Pierre van Houtryve
Date: 2025-09-01T10:17:38+02:00
New Revision: 9ad8e12c573f248b3f0ca4cca01fee1e5ed22c7b
URL: https://github.com/llvm/llvm-project/commit/9ad8e12c573f248b3f0ca4cca01fee1e5ed22c7b
DIFF: https://github.com/llvm/llvm-project/commit/9ad8e12c573f248b3f0ca4cca01fee1e5ed22c7b.diff
LOG: [AMDGPU] Expand scratch atomics to flat atomics if GAS is enabled (#154710)
Added:
llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7b3c77106cb89..a861d9a96c9e3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17773,11 +17773,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
!AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+ // Expansion kind for atomics on private (scratch) pointers; with GAS they are lowered to flat atomics.
+ return STI.hasGloballyAddressableScratch()
+ ? TargetLowering::AtomicExpansionKind::CustomExpand // GAS: handled by the emitExpandAtomic* hooks (scratch -> flat)
+ : TargetLowering::AtomicExpansionKind::NotAtomic; // no GAS: same kind the callers returned before this change
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
// 64-bit flat atomics that dynamically reside in private memory will silently
// be dropped.
@@ -18048,14 +18056,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget()) // private loads: kind now depends on the subtarget (GAS or not)
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
+ ? getPrivateAtomicExpansionKind(*getSubtarget()) // private stores: kind now depends on the subtarget (GAS or not)
: AtomicExpansionKind::None;
}
@@ -18063,7 +18071,7 @@ TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
unsigned AddrSpace = CmpX->getPointerAddressSpace();
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
- return AtomicExpansionKind::NotAtomic;
+ return getPrivateAtomicExpansionKind(*getSubtarget());
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
return AtomicExpansionKind::None;
@@ -18433,9 +18441,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
Builder.CreateBr(ExitBB);
}
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+ unsigned PtrOpIdx) { // Rewrites I in place: operand #PtrOpIdx (a private pointer) is replaced by a cast of it to a flat pointer.
+ Value *PtrOp = I->getOperand(PtrOpIdx);
+ assert(PtrOp->getType()->getPointerAddressSpace() ==
+ AMDGPUAS::PRIVATE_ADDRESS); // NOTE(review): no assert message here; LLVM style usually wants one - consider adding.
+
+ Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS); // pointer type in the flat address space
+ Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+ I->getIterator()); // insertion point is I's own position
+ I->setOperand(PtrOpIdx, ASCast); // I itself is kept; only its pointer operand changes
+}
+
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AtomicRMWInst::BinOp Op = AI->getOperation();
+ if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
Op == AtomicRMWInst::Xor) {
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18458,9 +18481,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
}
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) // cmpxchg on a private (scratch) pointer:
+ return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex()); // retarget it to a flat pointer and skip the predicate expansion below
+
emitExpandAtomicAddrSpacePredicate(CI);
}
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const { // Custom expansion of an atomic load; only the private -> flat rewrite is implemented.
+ if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) // load from a private (scratch) pointer:
+ return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex()); // swap its pointer operand for a flat-pointer cast
+
+ llvm_unreachable(
+ "Expand Atomic Load only handles SCRATCH -> FLAT conversion"); // shouldExpandAtomicLoadInIR returns None for every non-private load
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const { // Custom expansion of an atomic store; only the private -> flat rewrite is implemented.
+ if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) // store to a private (scratch) pointer:
+ return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex()); // swap its pointer operand for a flat-pointer cast
+
+ llvm_unreachable(
+ "Expand Atomic Store only handles SCRATCH -> FLAT conversion"); // shouldExpandAtomicStoreInIR returns None for every non-private store
+}
+
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index f6c24a40c44f8..728c6490bdfd6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+ void emitExpandAtomicLoad(LoadInst *LI) const override;
+ void emitExpandAtomicStore(StoreInst *SI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76fcfabf4..fcdba69c30213 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
ret void
}
-
-; GCN: scratch_atomic_store:
-; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN: .amdhsa_kernel scratch_atomic_store
-; CU: .amdhsa_uses_cu_stores 1
-; NOCU: .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
- store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index af5b529fc387e..4ca0cc92e09be 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,11 +179,32 @@ define amdgpu_kernel void @private_agent_unordered_load(
;
; GFX1250-LABEL: private_agent_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +379,32 @@ define amdgpu_kernel void @private_agent_monotonic_load(
;
; GFX1250-LABEL: private_agent_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +579,33 @@ define amdgpu_kernel void @private_agent_acquire_load(
;
; GFX1250-LABEL: private_agent_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +780,39 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
;
; GFX1250-LABEL: private_agent_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +967,32 @@ define amdgpu_kernel void @private_agent_unordered_store(
;
; GFX1250-LABEL: private_agent_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1032,11 +1145,32 @@ define amdgpu_kernel void @private_agent_monotonic_store(
;
; GFX1250-LABEL: private_agent_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1189,11 +1323,36 @@ define amdgpu_kernel void @private_agent_release_store(
;
; GFX1250-LABEL: private_agent_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1346,11 +1505,36 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
;
; GFX1250-LABEL: private_agent_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1503,11 +1687,32 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_agent_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1660,11 +1865,34 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw(
;
; GFX1250-LABEL: private_agent_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1817,11 +2045,36 @@ define amdgpu_kernel void @private_agent_release_atomicrmw(
;
; GFX1250-LABEL: private_agent_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1974,11 +2227,38 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_agent_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2131,11 +2411,38 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_agent_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2344,11 +2651,32 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2887,38 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3129,38 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3399,38 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3263,15 +3668,40 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3509,15 +3939,42 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3755,15 +4212,44 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4001,15 +4487,44 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4247,15 +4762,40 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4493,15 +5033,40 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4739,15 +5304,44 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4985,15 +5579,44 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5231,15 +5854,44 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5477,15 +6129,44 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5723,15 +6404,44 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5969,15 +6679,44 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6215,15 +6954,44 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6461,15 +7229,44 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6736,14 +7533,38 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7013,14 +7834,39 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7290,14 +8136,42 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_release_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7567,14 +8441,45 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7844,14 +8749,45 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8121,14 +9057,41 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8398,14 +9361,39 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8675,14 +9663,45 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8952,14 +9971,45 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9229,14 +10279,45 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9506,14 +10587,45 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9783,14 +10895,43 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10060,14 +11201,45 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10337,14 +11509,45 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10614,14 +11817,45 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10798,11 +12032,32 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load(
;
; GFX1250-LABEL: private_agent_one_as_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10977,11 +12232,32 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load(
;
; GFX1250-LABEL: private_agent_one_as_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11156,11 +12432,34 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load(
;
; GFX1250-LABEL: private_agent_one_as_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11335,11 +12634,40 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load(
;
; GFX1250-LABEL: private_agent_one_as_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11494,11 +12822,32 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store(
;
; GFX1250-LABEL: private_agent_one_as_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11651,11 +13000,32 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store(
;
; GFX1250-LABEL: private_agent_one_as_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11808,11 +13178,36 @@ define amdgpu_kernel void @private_agent_one_as_release_store(
;
; GFX1250-LABEL: private_agent_one_as_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11965,11 +13360,36 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store(
;
; GFX1250-LABEL: private_agent_one_as_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -12122,11 +13542,32 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_agent_one_as_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12279,11 +13720,34 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw(
;
; GFX1250-LABEL: private_agent_one_as_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12436,11 +13900,36 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw(
;
; GFX1250-LABEL: private_agent_one_as_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12593,11 +14082,38 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_agent_one_as_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12750,11 +14266,38 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_agent_one_as_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12963,11 +14506,33 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13178,11 +14743,39 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13393,11 +14986,39 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13636,15 +15257,38 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -13882,15 +15526,40 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14128,15 +15797,42 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14374,15 +16070,44 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14620,15 +16345,44 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14866,15 +16620,40 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15112,15 +16891,40 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15358,15 +17162,44 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15604,15 +17437,44 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15850,15 +17712,44 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16096,15 +17987,44 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16342,15 +18262,44 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16588,15 +18537,44 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16834,15 +18812,44 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17080,15 +19087,44 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17355,14 +19391,38 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17632,14 +19692,40 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17909,14 +19995,46 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18186,14 +20304,46 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18463,14 +20613,42 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18740,14 +20918,40 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19017,14 +21221,46 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19294,14 +21530,46 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19571,14 +21839,46 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19848,14 +22148,46 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20125,14 +22457,44 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20402,14 +22764,46 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20679,14 +23073,46 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20956,14 +23382,46 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
index ea310ce2c0ffb..e9ee6b4925a13 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
@@ -179,11 +179,32 @@ define amdgpu_kernel void @private_singlethread_unordered_load(
;
; GFX1250-LABEL: private_singlethread_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +379,32 @@ define amdgpu_kernel void @private_singlethread_monotonic_load(
;
; GFX1250-LABEL: private_singlethread_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +579,32 @@ define amdgpu_kernel void @private_singlethread_acquire_load(
;
; GFX1250-LABEL: private_singlethread_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +779,32 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load(
;
; GFX1250-LABEL: private_singlethread_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +959,32 @@ define amdgpu_kernel void @private_singlethread_unordered_store(
;
; GFX1250-LABEL: private_singlethread_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1032,11 +1137,32 @@ define amdgpu_kernel void @private_singlethread_monotonic_store(
;
; GFX1250-LABEL: private_singlethread_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1189,11 +1315,32 @@ define amdgpu_kernel void @private_singlethread_release_store(
;
; GFX1250-LABEL: private_singlethread_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1346,11 +1493,32 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store(
;
; GFX1250-LABEL: private_singlethread_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1503,11 +1671,32 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1660,11 +1849,32 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1817,11 +2027,32 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1974,11 +2205,32 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2131,11 +2383,32 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2344,11 +2617,31 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2852,31 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3087,31 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3350,38 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3263,15 +3619,38 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3509,15 +3888,38 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3755,15 +4157,38 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4001,15 +4426,38 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4247,15 +4695,38 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4493,15 +4964,38 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4739,15 +5233,38 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4985,15 +5502,38 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5231,15 +5771,38 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5477,15 +6040,38 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5723,15 +6309,38 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5969,15 +6578,38 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6215,15 +6847,38 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6461,15 +7116,38 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6736,14 +7414,38 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7013,14 +7715,38 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7290,14 +8016,38 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7567,14 +8317,38 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7844,14 +8618,38 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8121,14 +8919,38 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8398,14 +9220,38 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8675,14 +9521,38 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8952,14 +9822,38 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9229,14 +10123,38 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9506,14 +10424,38 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9783,14 +10725,38 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10060,14 +11026,38 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10337,14 +11327,38 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10614,14 +11628,38 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10798,11 +11836,32 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load(
;
; GFX1250-LABEL: private_singlethread_one_as_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10977,11 +12036,32 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load(
;
; GFX1250-LABEL: private_singlethread_one_as_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11156,11 +12236,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load(
;
; GFX1250-LABEL: private_singlethread_one_as_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11335,11 +12436,32 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load(
;
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11494,11 +12616,32 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store(
;
; GFX1250-LABEL: private_singlethread_one_as_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11651,11 +12794,32 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store(
;
; GFX1250-LABEL: private_singlethread_one_as_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11808,11 +12972,32 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store(
;
; GFX1250-LABEL: private_singlethread_one_as_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11965,11 +13150,32 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store(
;
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -12122,11 +13328,32 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12279,11 +13506,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_one_as_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12436,11 +13684,32 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_one_as_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12593,11 +13862,32 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12750,11 +14040,32 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12963,11 +14274,31 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13178,11 +14509,31 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13393,11 +14744,31 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13636,15 +15007,38 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc
;
; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -13882,15 +15276,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg
;
; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14128,15 +15545,38 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg
;
; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14374,15 +15814,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg
;
; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14620,15 +16083,38 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg
;
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14866,15 +16352,38 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg
;
; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15112,15 +16621,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15358,15 +16890,38 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15604,15 +17159,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15850,15 +17428,38 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16096,15 +17697,38 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg
;
; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16342,15 +17966,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16588,15 +18235,38 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16834,15 +18504,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17080,15 +18773,38 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17355,14 +19071,38 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c
; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17632,14 +19372,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp
; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17909,14 +19673,38 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp
; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18186,14 +19974,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp
; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18463,14 +20275,38 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18740,14 +20576,38 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp
; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19017,14 +20877,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc
; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19294,14 +21178,38 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc
; GFX1250-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19571,14 +21479,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc
; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19848,14 +21780,38 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20125,14 +22081,38 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp
; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20402,14 +22382,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc
; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20679,14 +22683,38 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc
; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20956,14 +22984,38 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc
; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -21233,14 +23285,38 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc
; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
index 2dac745ae605b..24ec3a34c4e6e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
@@ -179,11 +179,32 @@ define amdgpu_kernel void @private_system_unordered_load(
;
; GFX1250-LABEL: private_system_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +379,32 @@ define amdgpu_kernel void @private_system_monotonic_load(
;
; GFX1250-LABEL: private_system_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +579,33 @@ define amdgpu_kernel void @private_system_acquire_load(
;
; GFX1250-LABEL: private_system_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +780,39 @@ define amdgpu_kernel void @private_system_seq_cst_load(
;
; GFX1250-LABEL: private_system_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +967,32 @@ define amdgpu_kernel void @private_system_unordered_store(
;
; GFX1250-LABEL: private_system_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1032,11 +1145,32 @@ define amdgpu_kernel void @private_system_monotonic_store(
;
; GFX1250-LABEL: private_system_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1189,11 +1323,37 @@ define amdgpu_kernel void @private_system_release_store(
;
; GFX1250-LABEL: private_system_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1346,11 +1506,37 @@ define amdgpu_kernel void @private_system_seq_cst_store(
;
; GFX1250-LABEL: private_system_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1503,11 +1689,32 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_system_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1660,11 +1867,34 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw(
;
; GFX1250-LABEL: private_system_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1817,11 +2047,37 @@ define amdgpu_kernel void @private_system_release_atomicrmw(
;
; GFX1250-LABEL: private_system_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1974,11 +2230,39 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_system_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2131,11 +2415,39 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_system_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2344,11 +2656,32 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2892,39 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3135,39 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3406,38 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3263,15 +3675,40 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3509,15 +3946,43 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3755,15 +4220,45 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4001,15 +4496,45 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4247,15 +4772,40 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4493,15 +5043,40 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4739,15 +5314,45 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4985,15 +5590,45 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5231,15 +5866,45 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5477,15 +6142,45 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5752,14 +6447,38 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6029,14 +6748,39 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6306,14 +7050,46 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6583,14 +7359,46 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6860,14 +7668,41 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7137,14 +7972,39 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7414,14 +8274,46 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7691,14 +8583,46 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7968,14 +8892,46 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8245,14 +9201,46 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8522,14 +9510,44 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8799,14 +9817,46 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_relese_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9076,14 +10126,46 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9353,14 +10435,46 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9537,11 +10651,32 @@ define amdgpu_kernel void @private_system_one_as_unordered_load(
;
; GFX1250-LABEL: private_system_one_as_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -9716,11 +10851,32 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load(
;
; GFX1250-LABEL: private_system_one_as_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -9895,11 +11051,34 @@ define amdgpu_kernel void @private_system_one_as_acquire_load(
;
; GFX1250-LABEL: private_system_one_as_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10074,11 +11253,40 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load(
;
; GFX1250-LABEL: private_system_one_as_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10233,11 +11441,32 @@ define amdgpu_kernel void @private_system_one_as_unordered_store(
;
; GFX1250-LABEL: private_system_one_as_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -10390,11 +11619,32 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store(
;
; GFX1250-LABEL: private_system_one_as_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -10547,11 +11797,37 @@ define amdgpu_kernel void @private_system_one_as_release_store(
;
; GFX1250-LABEL: private_system_one_as_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -10704,11 +11980,37 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store(
;
; GFX1250-LABEL: private_system_one_as_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -10861,11 +12163,32 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_system_one_as_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -11018,11 +12341,34 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw(
;
; GFX1250-LABEL: private_system_one_as_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -11175,11 +12521,37 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw(
;
; GFX1250-LABEL: private_system_one_as_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -11332,11 +12704,39 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_system_one_as_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -11489,11 +12889,39 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_system_one_as_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -11702,11 +13130,33 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -11917,11 +13367,40 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -12132,11 +13611,40 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -12375,15 +13883,38 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -12621,15 +14152,40 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -12867,15 +14423,43 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -13113,15 +14697,45 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -13359,15 +14973,45 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -13605,15 +15249,40 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -13851,15 +15520,40 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14097,15 +15791,45 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14343,15 +16067,45 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14589,15 +16343,45 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14835,15 +16619,45 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15081,15 +16895,45 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15327,15 +17171,45 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15573,15 +17447,45 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15819,15 +17723,45 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16094,14 +18028,38 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg
; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16371,14 +18329,40 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16648,14 +18632,43 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16925,14 +18938,47 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17202,14 +19248,47 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17479,14 +19558,42 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17756,14 +19863,40 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18033,14 +20166,47 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18310,14 +20476,47 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18587,14 +20786,47 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18864,14 +21096,47 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19141,14 +21406,45 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19418,14 +21714,47 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19695,14 +22024,47 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19972,14 +22334,47 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
index b628994778a38..8b2254412c0c8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
@@ -179,11 +179,32 @@ define amdgpu_kernel void @private_wavefront_unordered_load(
;
; GFX1250-LABEL: private_wavefront_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +379,32 @@ define amdgpu_kernel void @private_wavefront_monotonic_load(
;
; GFX1250-LABEL: private_wavefront_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +579,32 @@ define amdgpu_kernel void @private_wavefront_acquire_load(
;
; GFX1250-LABEL: private_wavefront_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +779,32 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load(
;
; GFX1250-LABEL: private_wavefront_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +959,32 @@ define amdgpu_kernel void @private_wavefront_unordered_store(
;
; GFX1250-LABEL: private_wavefront_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1032,11 +1137,32 @@ define amdgpu_kernel void @private_wavefront_monotonic_store(
;
; GFX1250-LABEL: private_wavefront_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1189,11 +1315,32 @@ define amdgpu_kernel void @private_wavefront_release_store(
;
; GFX1250-LABEL: private_wavefront_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1346,11 +1493,32 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store(
;
; GFX1250-LABEL: private_wavefront_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1503,11 +1671,32 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1660,11 +1849,32 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1817,11 +2027,32 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1974,11 +2205,32 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2131,11 +2383,32 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2344,11 +2617,31 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2852,31 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3087,31 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3350,38 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3263,15 +3619,38 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3509,15 +3888,38 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3755,15 +4157,38 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4001,15 +4426,38 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4247,15 +4695,38 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4493,15 +4964,38 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4739,15 +5233,38 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4985,15 +5502,38 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5231,15 +5771,38 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5477,15 +6040,38 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5723,15 +6309,38 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5969,15 +6578,38 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6215,15 +6847,38 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6461,15 +7116,38 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6736,14 +7414,38 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7013,14 +7715,38 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7290,14 +8016,38 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7567,14 +8317,38 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7844,14 +8618,38 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8121,14 +8919,38 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8398,14 +9220,38 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8675,14 +9521,38 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8952,14 +9822,38 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9229,14 +10123,38 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9506,14 +10424,38 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9783,14 +10725,38 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10060,14 +11026,38 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10337,14 +11327,38 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10614,14 +11628,38 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10798,11 +11836,32 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load(
;
; GFX1250-LABEL: private_wavefront_one_as_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10977,11 +12036,32 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load(
;
; GFX1250-LABEL: private_wavefront_one_as_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11156,11 +12236,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load(
;
; GFX1250-LABEL: private_wavefront_one_as_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11335,11 +12436,32 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load(
;
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11494,11 +12616,32 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store(
;
; GFX1250-LABEL: private_wavefront_one_as_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11651,11 +12794,32 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store(
;
; GFX1250-LABEL: private_wavefront_one_as_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11808,11 +12972,32 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store(
;
; GFX1250-LABEL: private_wavefront_one_as_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11965,11 +13150,32 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store(
;
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -12122,11 +13328,32 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12279,11 +13506,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_one_as_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12436,11 +13684,32 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_one_as_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12593,11 +13862,32 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12750,11 +14040,32 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12963,11 +14274,31 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13178,11 +14509,31 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13393,11 +14744,31 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13636,15 +15007,38 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -13882,15 +15276,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14128,15 +15545,38 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14374,15 +15814,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14620,15 +16083,38 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14866,15 +16352,38 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15112,15 +16621,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15358,15 +16890,38 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15604,15 +17159,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15850,15 +17428,38 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16096,15 +17697,38 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16342,15 +17966,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16588,15 +18235,38 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16834,15 +18504,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17080,15 +18773,38 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17355,14 +19071,38 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx
; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17632,14 +19372,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch
; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17909,14 +19673,38 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch
; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18186,14 +19974,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch
; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18463,14 +20275,38 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18740,14 +20576,38 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch
; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19017,14 +20877,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19294,14 +21178,38 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19571,14 +21479,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19848,14 +21780,38 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20125,14 +22081,38 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch
; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20402,14 +22382,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20679,14 +22683,38 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20956,14 +22984,38 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -21233,14 +23285,38 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
index a27b40a8209ec..127434c365f95 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
@@ -179,11 +179,32 @@ define amdgpu_kernel void @private_workgroup_unordered_load(
;
; GFX1250-LABEL: private_workgroup_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +379,32 @@ define amdgpu_kernel void @private_workgroup_monotonic_load(
;
; GFX1250-LABEL: private_workgroup_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +579,32 @@ define amdgpu_kernel void @private_workgroup_acquire_load(
;
; GFX1250-LABEL: private_workgroup_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +779,33 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load(
;
; GFX1250-LABEL: private_workgroup_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +960,32 @@ define amdgpu_kernel void @private_workgroup_unordered_store(
;
; GFX1250-LABEL: private_workgroup_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1032,11 +1138,32 @@ define amdgpu_kernel void @private_workgroup_monotonic_store(
;
; GFX1250-LABEL: private_workgroup_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1189,11 +1316,33 @@ define amdgpu_kernel void @private_workgroup_release_store(
;
; GFX1250-LABEL: private_workgroup_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1346,11 +1495,33 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store(
;
; GFX1250-LABEL: private_workgroup_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -1503,11 +1674,32 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1660,11 +1852,33 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1817,11 +2031,33 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -1974,11 +2210,34 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2131,11 +2390,34 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2344,11 +2626,31 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2861,32 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3097,32 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3361,38 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3263,15 +3630,39 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3509,15 +3900,39 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3755,15 +4170,40 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4001,15 +4441,40 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4247,15 +4712,39 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4493,15 +4982,39 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4739,15 +5252,40 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4985,15 +5523,40 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5231,15 +5794,40 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5477,15 +6065,40 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5723,15 +6336,40 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5969,15 +6607,40 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6215,15 +6878,40 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6461,15 +7149,40 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6736,14 +7449,38 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7013,14 +7750,38 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7290,14 +8051,39 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7567,14 +8353,39 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7844,14 +8655,39 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8121,14 +8957,38 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8398,14 +9258,38 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8675,14 +9559,39 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8952,14 +9861,39 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9229,14 +10163,39 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9506,14 +10465,39 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9783,14 +10767,39 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10060,14 +11069,39 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10337,14 +11371,39 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10614,14 +11673,39 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10798,11 +11882,32 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load(
;
; GFX1250-LABEL: private_workgroup_one_as_unordered_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10977,11 +12082,32 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load(
;
; GFX1250-LABEL: private_workgroup_one_as_monotonic_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11156,11 +12282,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load(
;
; GFX1250-LABEL: private_workgroup_one_as_acquire_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11335,11 +12482,32 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load(
;
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_load:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s1
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11494,11 +12662,32 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store(
;
; GFX1250-LABEL: private_workgroup_one_as_unordered_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11651,11 +12840,32 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store(
;
; GFX1250-LABEL: private_workgroup_one_as_monotonic_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11808,11 +13018,32 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store(
;
; GFX1250-LABEL: private_workgroup_one_as_release_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -11965,11 +13196,32 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store(
;
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_store:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
entry:
@@ -12122,11 +13374,32 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12279,11 +13552,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_one_as_acquire_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12436,11 +13730,32 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_one_as_release_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12593,11 +13908,32 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12750,11 +14086,32 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -12963,11 +14320,31 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13178,11 +14555,31 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13393,11 +14790,31 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13636,15 +15053,38 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -13882,15 +15322,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14128,15 +15591,38 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14374,15 +15860,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14620,15 +16129,38 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -14866,15 +16398,38 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15112,15 +16667,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15358,15 +16936,38 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15604,15 +17205,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15850,15 +17474,38 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16096,15 +17743,38 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16342,15 +18012,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16588,15 +18281,38 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16834,15 +18550,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17080,15 +18819,38 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17355,14 +19117,38 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx
; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17632,14 +19418,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch
; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17909,14 +19719,38 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch
; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18186,14 +20020,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch
; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18463,14 +20321,38 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18740,14 +20622,38 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch
; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19017,14 +20923,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19294,14 +21224,38 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19571,14 +21525,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19848,14 +21826,38 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20125,14 +22127,38 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch
; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20402,14 +22428,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20679,14 +22729,38 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20956,14 +23030,38 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -21233,14 +23331,38 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT: scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll
new file mode 100644
index 0000000000000..2731f2118a261
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll
@@ -0,0 +1,235 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=GFX1200 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=atomic-expand %s | FileCheck -check-prefixes=GFX1250 %s
+
+define void @system_atomic_store_unordered_float(ptr addrspace(5) %addr, float %val) {
+; GFX1200-LABEL: define void @system_atomic_store_unordered_float(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX1200-NEXT: store float [[VAL]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret void
+;
+; GFX1250-LABEL: define void @system_atomic_store_unordered_float(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: store atomic float [[VAL]], ptr [[SCRATCH_ASCAST]] unordered, align 4
+; GFX1250-NEXT: ret void
+;
+ store atomic float %val, ptr addrspace(5) %addr unordered, align 4
+ ret void
+}
+
+define void @system_atomic_store_unordered_i32(ptr addrspace(5) %addr, i32 %val) {
+; GFX1200-LABEL: define void @system_atomic_store_unordered_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: store i32 [[VAL]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret void
+;
+; GFX1250-LABEL: define void @system_atomic_store_unordered_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: store atomic i32 [[VAL]], ptr [[SCRATCH_ASCAST]] unordered, align 4
+; GFX1250-NEXT: ret void
+;
+ store atomic i32 %val, ptr addrspace(5) %addr unordered, align 4
+ ret void
+}
+
+define void @system_atomic_store_release_i32(ptr addrspace(5) %addr, i32 %val) {
+; GFX1200-LABEL: define void @system_atomic_store_release_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: store i32 [[VAL]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret void
+;
+; GFX1250-LABEL: define void @system_atomic_store_release_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: store atomic i32 [[VAL]], ptr [[SCRATCH_ASCAST]] release, align 4
+; GFX1250-NEXT: ret void
+;
+ store atomic i32 %val, ptr addrspace(5) %addr release, align 4
+ ret void
+}
+
+define void @workgroup_atomic_store_release_i32(ptr addrspace(5) %addr, i32 %val) {
+; GFX1200-LABEL: define void @workgroup_atomic_store_release_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: store i32 [[VAL]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret void
+;
+; GFX1250-LABEL: define void @workgroup_atomic_store_release_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: store atomic i32 [[VAL]], ptr [[SCRATCH_ASCAST]] syncscope("workgroup") release, align 4
+; GFX1250-NEXT: ret void
+;
+ store atomic i32 %val, ptr addrspace(5) %addr syncscope("workgroup") release, align 4
+ ret void
+}
+
+define float @system_atomic_load_unordered_float(ptr addrspace(5) %addr) {
+; GFX1200-LABEL: define float @system_atomic_load_unordered_float(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[VAL:%.*]] = load float, ptr addrspace(5) [[ADDR]], align 4, !invariant.load [[META0:![0-9]+]], !nontemporal [[META1:![0-9]+]]
+; GFX1200-NEXT: ret float [[VAL]]
+;
+; GFX1250-LABEL: define float @system_atomic_load_unordered_float(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = load atomic float, ptr [[SCRATCH_ASCAST]] unordered, align 4, !invariant.load [[META0:![0-9]+]], !nontemporal [[META1:![0-9]+]]
+; GFX1250-NEXT: ret float [[VAL]]
+;
+ %val = load atomic float, ptr addrspace(5) %addr unordered, align 4, !invariant.load !1, !nontemporal !0
+ ret float %val
+}
+
+define i32 @system_atomic_load_unordered_i32(ptr addrspace(5) %addr) {
+; GFX1200-LABEL: define i32 @system_atomic_load_unordered_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret i32 [[VAL]]
+;
+; GFX1250-LABEL: define i32 @system_atomic_load_unordered_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = load atomic i32, ptr [[SCRATCH_ASCAST]] unordered, align 4
+; GFX1250-NEXT: ret i32 [[VAL]]
+;
+ %val = load atomic i32, ptr addrspace(5) %addr unordered, align 4
+ ret i32 %val
+}
+
+define i32 @system_atomic_load_acquire_i32(ptr addrspace(5) %addr) {
+; GFX1200-LABEL: define i32 @system_atomic_load_acquire_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret i32 [[VAL]]
+;
+; GFX1250-LABEL: define i32 @system_atomic_load_acquire_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = load atomic i32, ptr [[SCRATCH_ASCAST]] acquire, align 4
+; GFX1250-NEXT: ret i32 [[VAL]]
+;
+ %val = load atomic i32, ptr addrspace(5) %addr acquire, align 4
+ ret i32 %val
+}
+
+define i32 @workgroup_atomic_load_acquire_i32(ptr addrspace(5) %addr) {
+; GFX1200-LABEL: define i32 @workgroup_atomic_load_acquire_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret i32 [[VAL]]
+;
+; GFX1250-LABEL: define i32 @workgroup_atomic_load_acquire_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = load atomic i32, ptr [[SCRATCH_ASCAST]] syncscope("workgroup") acquire, align 4
+; GFX1250-NEXT: ret i32 [[VAL]]
+;
+ %val = load atomic i32, ptr addrspace(5) %addr syncscope("workgroup") acquire, align 4
+ ret i32 %val
+}
+
+define i32 @system_atomic_cmpxchg_acq_rel_acquire_i32(ptr addrspace(5) %addr, i32 %old, i32 %in) {
+; GFX1200-LABEL: define i32 @system_atomic_cmpxchg_acq_rel_acquire_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[OLD:%.*]], i32 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[OLD]]
+; GFX1200-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[IN]], i32 [[TMP1]]
+; GFX1200-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: [[TMP4:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
+; GFX1200-NEXT: [[TMP5:%.*]] = insertvalue { i32, i1 } [[TMP4]], i1 [[TMP2]], 1
+; GFX1200-NEXT: [[RES:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX1200-NEXT: ret i32 [[RES]]
+;
+; GFX1250-LABEL: define i32 @system_atomic_cmpxchg_acq_rel_acquire_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[OLD:%.*]], i32 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = cmpxchg volatile ptr [[SCRATCH_ASCAST]], i32 [[OLD]], i32 [[IN]] acq_rel acquire, align 4, !nontemporal [[META1]]
+; GFX1250-NEXT: [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; GFX1250-NEXT: ret i32 [[RES]]
+;
+ %val = cmpxchg volatile ptr addrspace(5) %addr, i32 %old, i32 %in acq_rel acquire, !nontemporal !0
+ %res = extractvalue { i32, i1 } %val, 0
+ ret i32 %res
+}
+
+define i32 @system_atomicrmw_xchg_acq_rel_i32(ptr addrspace(5) %addr, i32 %in) {
+; GFX1200-LABEL: define i32 @system_atomicrmw_xchg_acq_rel_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: store i32 [[IN]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret i32 [[TMP1]]
+;
+; GFX1250-LABEL: define i32 @system_atomicrmw_xchg_acq_rel_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = atomicrmw volatile xchg ptr [[SCRATCH_ASCAST]], i32 [[IN]] acq_rel, align 4
+; GFX1250-NEXT: ret i32 [[VAL]]
+;
+ %val = atomicrmw volatile xchg ptr addrspace(5) %addr, i32 %in acq_rel
+ ret i32 %val
+}
+
+define i16 @system_atomicrmw_xchg_acq_rel_i16(ptr addrspace(5) %addr, i16 %in) {
+; GFX1200-LABEL: define i16 @system_atomicrmw_xchg_acq_rel_i16(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i16 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(5) [[ADDR]], align 2
+; GFX1200-NEXT: store i16 [[IN]], ptr addrspace(5) [[ADDR]], align 2
+; GFX1200-NEXT: ret i16 [[TMP1]]
+;
+; GFX1250-LABEL: define i16 @system_atomicrmw_xchg_acq_rel_i16(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i16 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = atomicrmw volatile xchg ptr [[SCRATCH_ASCAST]], i16 [[IN]] acq_rel, align 2
+; GFX1250-NEXT: ret i16 [[VAL]]
+;
+ %val = atomicrmw volatile xchg ptr addrspace(5) %addr, i16 %in acq_rel
+ ret i16 %val
+}
+
+define half @system_atomicrmw_fmax_acq_rel_half(ptr addrspace(5) %addr, half %in) {
+; GFX1200-LABEL: define half @system_atomicrmw_fmax_acq_rel_half(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], half [[IN:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[TMP1:%.*]] = load half, ptr addrspace(5) [[ADDR]], align 2
+; GFX1200-NEXT: [[TMP2:%.*]] = call half @llvm.maxnum.f16(half [[TMP1]], half [[IN]])
+; GFX1200-NEXT: store half [[TMP2]], ptr addrspace(5) [[ADDR]], align 2
+; GFX1200-NEXT: ret half [[TMP1]]
+;
+; GFX1250-LABEL: define half @system_atomicrmw_fmax_acq_rel_half(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], half [[IN:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = atomicrmw volatile fmax ptr [[SCRATCH_ASCAST]], half [[IN]] acq_rel, align 2
+; GFX1250-NEXT: ret half [[VAL]]
+;
+ %val = atomicrmw volatile fmax ptr addrspace(5) %addr, half %in acq_rel
+ ret half %val
+}
+
+define float @system_atomicrmw_fminimum_acq_rel_float(ptr addrspace(5) %addr, float %in) {
+; GFX1200-LABEL: define float @system_atomicrmw_fminimum_acq_rel_float(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], float [[IN:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: [[TMP2:%.*]] = call float @llvm.minimum.f32(float [[TMP1]], float [[IN]])
+; GFX1200-NEXT: store float [[TMP2]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT: ret float [[TMP1]]
+;
+; GFX1250-LABEL: define float @system_atomicrmw_fminimum_acq_rel_float(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], float [[IN:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT: [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT: [[VAL:%.*]] = atomicrmw volatile fminimum ptr [[SCRATCH_ASCAST]], float [[IN]] acq_rel, align 4, !nontemporal [[META1]]
+; GFX1250-NEXT: ret float [[VAL]]
+;
+ %val = atomicrmw volatile fminimum ptr addrspace(5) %addr, float %in acq_rel, !nontemporal !0
+ ret float %val
+}
+
+!0 = !{}
+!1 = !{i32 1}
+;.
+; GFX1200: [[META0]] = !{i32 1}
+; GFX1200: [[META1]] = !{}
+;.
+; GFX1250: [[META0]] = !{i32 1}
+; GFX1250: [[META1]] = !{}
+;.
More information about the llvm-commits
mailing list