[llvm-branch-commits] [llvm] [AMDGPU] Expand scratch atomics to flat atomics if GAS is enabled (PR #154710)

Pierre van Houtryve via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Aug 21 02:31:42 PDT 2025


https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/154710

None

>From 83f5e4d15007b481510c803daa9046ec9588370b Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Aug 2025 10:43:48 +0200
Subject: [PATCH] [AMDGPU] Expand scratch atomics to flat atomics if GAS is
 enabled

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   50 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    2 +
 .../AMDGPU/gfx1250-no-scope-cu-stores.ll      |   12 -
 .../AMDGPU/memory-legalizer-private-agent.ll  | 3739 ++++++++++++++---
 .../memory-legalizer-private-singlethread.ll  | 3432 ++++++++++++---
 .../AMDGPU/memory-legalizer-private-system.ll | 3606 +++++++++++++---
 .../memory-legalizer-private-wavefront.ll     | 3432 ++++++++++++---
 .../memory-legalizer-private-workgroup.ll     | 3478 ++++++++++++---
 .../AMDGPU/expand-atomic-private-gas.ll       |  172 +
 9 files changed, 15308 insertions(+), 2615 deletions(-)
 create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..60faf211df0d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
          !AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
 }
 
+static TargetLowering::AtomicExpansionKind
+getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
+  // For GAS, lower to flat atomic.
+  return STI.hasGloballyAddressableScratch()
+             ? TargetLowering::AtomicExpansionKind::Expand
+             : TargetLowering::AtomicExpansionKind::NotAtomic;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   // 64-bit flat atomics that dynamically reside in private memory will silently
   // be dropped.
@@ -18038,14 +18046,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
+             ? getPrivateAtomicExpansionKind(*getSubtarget())
              : AtomicExpansionKind::None;
 }
 
@@ -18053,7 +18061,7 @@ TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
   unsigned AddrSpace = CmpX->getPointerAddressSpace();
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
-    return AtomicExpansionKind::NotAtomic;
+    return getPrivateAtomicExpansionKind(*getSubtarget());
 
   if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
     return AtomicExpansionKind::None;
@@ -18423,9 +18431,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
   Builder.CreateBr(ExitBB);
 }
 
+static void convertScratchAtomicToFlatAtomic(Instruction *I,
+                                             unsigned PtrOpIdx) {
+  Value *PtrOp = I->getOperand(PtrOpIdx);
+  assert(PtrOp->getType()->getPointerAddressSpace() ==
+         AMDGPUAS::PRIVATE_ADDRESS);
+
+  Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
+  Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
+                                              I->getIterator());
+  I->setOperand(PtrOpIdx, ASCast);
+}
+
 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   AtomicRMWInst::BinOp Op = AI->getOperation();
 
+  if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
+
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
     if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@@ -18448,9 +18471,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 }
 
 void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
+
   emitExpandAtomicAddrSpacePredicate(CI);
 }
 
+void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
+  if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
+
+  llvm_unreachable(
+      "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
+}
+
+void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
+  if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+    return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
+
+  llvm_unreachable(
+      "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..e96b702367299 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -562,6 +562,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
   void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
   void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
+  void emitExpandAtomicLoad(LoadInst *LI) const override;
+  void emitExpandAtomicStore(StoreInst *SI) const override;
 
   LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76fcfabf4..fcdba69c30213 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
   store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
   ret void
 }
-
-; GCN:     scratch_atomic_store:
-; CU:        scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU:      scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN:     .amdhsa_kernel scratch_atomic_store
-; CU:        .amdhsa_uses_cu_stores 1
-; NOCU:      .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
-  store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index af5b529fc387e..fe345f9244066 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_agent_unordered_load(
 ;
 ; GFX1250-LABEL: private_agent_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_agent_monotonic_load(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,36 @@ define amdgpu_kernel void @private_agent_acquire_load(
 ;
 ; GFX1250-LABEL: private_agent_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +789,42 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +979,35 @@ define amdgpu_kernel void @private_agent_unordered_store(
 ;
 ; GFX1250-LABEL: private_agent_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_agent_monotonic_store(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1189,11 +1341,39 @@ define amdgpu_kernel void @private_agent_release_store(
 ;
 ; GFX1250-LABEL: private_agent_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1346,11 +1526,39 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1503,11 +1711,35 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1660,11 +1892,37 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1817,11 +2075,39 @@ define amdgpu_kernel void @private_agent_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1974,11 +2260,41 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2131,11 +2447,41 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2344,11 +2690,35 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2929,41 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3174,41 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3447,41 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3263,15 +3719,43 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3509,15 +3993,45 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3755,15 +4269,47 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4001,15 +4547,47 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4247,15 +4825,43 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4493,15 +5099,43 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4739,15 +5373,47 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4985,15 +5651,47 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5231,15 +5929,47 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5477,15 +6207,47 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5723,15 +6485,47 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5969,15 +6763,47 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6215,15 +7041,47 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6461,15 +7319,47 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6736,14 +7626,41 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7013,14 +7930,42 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7290,14 +8235,45 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7567,14 +8543,48 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7844,14 +8854,48 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8121,14 +9165,44 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8398,14 +9472,42 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8675,14 +9777,48 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8952,14 +10088,48 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9229,14 +10399,48 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9506,14 +10710,48 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9783,14 +11021,46 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10060,14 +11330,48 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10337,14 +11641,48 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10614,14 +11952,48 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10798,11 +12170,35 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_agent_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10977,11 +12373,35 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11156,11 +12576,37 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11335,11 +12781,43 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11494,11 +12972,35 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_agent_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11651,11 +13153,35 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11808,11 +13334,39 @@ define amdgpu_kernel void @private_agent_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11965,11 +13519,39 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -12122,11 +13704,35 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12279,11 +13885,37 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12436,11 +14068,39 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12593,11 +14253,41 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12750,11 +14440,41 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12963,11 +14683,36 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13178,11 +14923,42 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13393,11 +15169,42 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13636,15 +15443,41 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -13882,15 +15715,43 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14128,15 +15989,45 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14374,15 +16265,47 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14620,15 +16543,47 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14866,15 +16821,43 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15112,15 +17095,43 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15358,15 +17369,47 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15604,15 +17647,47 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15850,15 +17925,47 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16096,15 +18203,47 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16342,15 +18481,47 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16588,15 +18759,47 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16834,15 +19037,47 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -17080,15 +19315,47 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -17355,14 +19622,41 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17632,14 +19926,43 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17909,14 +20232,49 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18186,14 +20544,49 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18463,14 +20856,45 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18740,14 +21164,43 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19017,14 +21470,49 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19294,14 +21782,49 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19571,14 +22094,49 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19848,14 +22406,49 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20125,14 +22718,47 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20402,14 +23028,49 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20679,14 +23340,49 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20956,14 +23652,49 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
index ea310ce2c0ffb..9bf9604006637 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_singlethread_unordered_load(
 ;
 ; GFX1250-LABEL: private_singlethread_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_singlethread_monotonic_load(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,35 @@ define amdgpu_kernel void @private_singlethread_acquire_load(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +788,35 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +971,35 @@ define amdgpu_kernel void @private_singlethread_unordered_store(
 ;
 ; GFX1250-LABEL: private_singlethread_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1032,11 +1152,35 @@ define amdgpu_kernel void @private_singlethread_monotonic_store(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1189,11 +1333,35 @@ define amdgpu_kernel void @private_singlethread_release_store(
 ;
 ; GFX1250-LABEL: private_singlethread_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1346,11 +1514,35 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1503,11 +1695,35 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1660,11 +1876,35 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1817,11 +2057,35 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1974,11 +2238,35 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2131,11 +2419,35 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2344,11 +2656,34 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2894,34 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3132,34 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3398,41 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3263,15 +3670,41 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3509,15 +3942,41 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3755,15 +4214,41 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4001,15 +4486,41 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4247,15 +4758,41 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4493,15 +5030,41 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4739,15 +5302,41 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4985,15 +5574,41 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5231,15 +5846,41 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5477,15 +6118,41 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5723,15 +6390,41 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5969,15 +6662,41 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6215,15 +6934,41 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6461,15 +7206,41 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6736,14 +7507,41 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7013,14 +7811,41 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7290,14 +8115,41 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7567,14 +8419,41 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7844,14 +8723,41 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8121,14 +9027,41 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8398,14 +9331,41 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8675,14 +9635,41 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8952,14 +9939,41 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9229,14 +10243,41 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9506,14 +10547,41 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9783,14 +10851,41 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10060,14 +11155,41 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10337,14 +11459,41 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10614,14 +11763,41 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10798,11 +11974,35 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10977,11 +12177,35 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11156,11 +12380,35 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11335,11 +12583,35 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11494,11 +12766,35 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11651,11 +12947,35 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11808,11 +13128,35 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11965,11 +13309,35 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -12122,11 +13490,35 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12279,11 +13671,35 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12436,11 +13852,35 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12593,11 +14033,35 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12750,11 +14214,35 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12963,11 +14451,34 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13178,11 +14689,34 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13393,11 +14927,34 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13636,15 +15193,41 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -13882,15 +15465,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14128,15 +15737,41 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14374,15 +16009,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14620,15 +16281,41 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14866,15 +16553,41 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15112,15 +16825,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15358,15 +17097,41 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15604,15 +17369,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15850,15 +17641,41 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16096,15 +17913,41 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16342,15 +18185,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16588,15 +18457,41 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16834,15 +18729,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -17080,15 +19001,41 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -17355,14 +19302,41 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17632,14 +19606,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17909,14 +19910,41 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp
 ; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18186,14 +20214,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18463,14 +20518,41 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18740,14 +20822,41 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19017,14 +21126,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19294,14 +21430,41 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc
 ; GFX1250-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19571,14 +21734,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19848,14 +22038,41 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20125,14 +22342,41 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20402,14 +22646,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20679,14 +22950,41 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc
 ; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20956,14 +23254,41 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -21233,14 +23558,41 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
index 2dac745ae605b..c8113ff441f76 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_system_unordered_load(
 ;
 ; GFX1250-LABEL: private_system_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_system_monotonic_load(
 ;
 ; GFX1250-LABEL: private_system_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,36 @@ define amdgpu_kernel void @private_system_acquire_load(
 ;
 ; GFX1250-LABEL: private_system_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +789,42 @@ define amdgpu_kernel void @private_system_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +979,35 @@ define amdgpu_kernel void @private_system_unordered_store(
 ;
 ; GFX1250-LABEL: private_system_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1032,11 +1160,35 @@ define amdgpu_kernel void @private_system_monotonic_store(
 ;
 ; GFX1250-LABEL: private_system_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1189,11 +1341,40 @@ define amdgpu_kernel void @private_system_release_store(
 ;
 ; GFX1250-LABEL: private_system_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1346,11 +1527,40 @@ define amdgpu_kernel void @private_system_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1503,11 +1713,35 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1660,11 +1894,37 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1817,11 +2077,40 @@ define amdgpu_kernel void @private_system_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1974,11 +2263,42 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2131,11 +2451,42 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2344,11 +2695,35 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2934,42 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3180,42 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3454,41 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3263,15 +3726,43 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3509,15 +4000,46 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3755,15 +4277,48 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4001,15 +4556,48 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4247,15 +4835,43 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4493,15 +5109,43 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4739,15 +5383,48 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4985,15 +5662,48 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5231,15 +5941,48 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5477,15 +6220,48 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5752,14 +6528,41 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6029,14 +6832,42 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6306,14 +7137,49 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6583,14 +7449,49 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6860,14 +7761,44 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7137,14 +8068,42 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7414,14 +8373,49 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7691,14 +8685,49 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7968,14 +8997,49 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8245,14 +9309,49 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8522,14 +9621,47 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8799,14 +9931,49 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_relese_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9076,14 +10243,49 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9353,14 +10555,49 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9537,11 +10774,35 @@ define amdgpu_kernel void @private_system_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_system_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -9716,11 +10977,35 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -9895,11 +11180,37 @@ define amdgpu_kernel void @private_system_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10074,11 +11385,43 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10233,11 +11576,35 @@ define amdgpu_kernel void @private_system_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_system_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -10390,11 +11757,35 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -10547,11 +11938,40 @@ define amdgpu_kernel void @private_system_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -10704,11 +12124,40 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -10861,11 +12310,35 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -11018,11 +12491,37 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -11175,11 +12674,40 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -11332,11 +12860,42 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -11489,11 +13048,42 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -11702,11 +13292,36 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -11917,11 +13532,43 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -12132,11 +13779,43 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -12375,15 +14054,41 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -12621,15 +14326,43 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -12867,15 +14600,46 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -13113,15 +14877,48 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -13359,15 +15156,48 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -13605,15 +15435,43 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -13851,15 +15709,43 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14097,15 +15983,48 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14343,15 +16262,48 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14589,15 +16541,48 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14835,15 +16820,48 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15081,15 +17099,48 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15327,15 +17378,48 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15573,15 +17657,48 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15819,15 +17936,48 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16094,14 +18244,41 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg
 ; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16371,14 +18548,43 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16648,14 +18854,46 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16925,14 +19163,50 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17202,14 +19476,50 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17479,14 +19789,45 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17756,14 +20097,43 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18033,14 +20403,50 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18310,14 +20716,50 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18587,14 +21029,50 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18864,14 +21342,50 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19141,14 +21655,48 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19418,14 +21966,50 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19695,14 +22279,50 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19972,14 +22592,50 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_bvhcnt 0x0
+; GFX1250-NEXT:    s_wait_samplecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
index b628994778a38..91f2bde07fafa 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_wavefront_unordered_load(
 ;
 ; GFX1250-LABEL: private_wavefront_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_wavefront_monotonic_load(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,35 @@ define amdgpu_kernel void @private_wavefront_acquire_load(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +788,35 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +971,35 @@ define amdgpu_kernel void @private_wavefront_unordered_store(
 ;
 ; GFX1250-LABEL: private_wavefront_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1032,11 +1152,35 @@ define amdgpu_kernel void @private_wavefront_monotonic_store(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1189,11 +1333,35 @@ define amdgpu_kernel void @private_wavefront_release_store(
 ;
 ; GFX1250-LABEL: private_wavefront_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1346,11 +1514,35 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1503,11 +1695,35 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1660,11 +1876,35 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1817,11 +2057,35 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1974,11 +2238,35 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2131,11 +2419,35 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2344,11 +2656,34 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2894,34 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3132,34 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3398,41 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3263,15 +3670,41 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3509,15 +3942,41 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3755,15 +4214,41 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4001,15 +4486,41 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4247,15 +4758,41 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4493,15 +5030,41 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4739,15 +5302,41 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4985,15 +5574,41 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5231,15 +5846,41 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5477,15 +6118,41 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5723,15 +6390,41 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5969,15 +6662,41 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6215,15 +6934,41 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6461,15 +7206,41 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6736,14 +7507,41 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
    ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7013,14 +7811,41 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7290,14 +8115,41 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
    ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7567,14 +8419,41 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7844,14 +8723,41 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8121,14 +9027,41 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8398,14 +9331,41 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8675,14 +9635,41 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8952,14 +9939,41 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9229,14 +10243,41 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9506,14 +10547,41 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9783,14 +10851,41 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10060,14 +11155,41 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10337,14 +11459,41 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10614,14 +11763,41 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10798,11 +11974,35 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10977,11 +12177,35 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11156,11 +12380,35 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11335,11 +12583,35 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11494,11 +12766,35 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11651,11 +12947,35 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11808,11 +13128,35 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11965,11 +13309,35 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -12122,11 +13490,35 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12279,11 +13671,35 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12436,11 +13852,35 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12593,11 +14033,35 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12750,11 +14214,35 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12963,11 +14451,34 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13178,11 +14689,34 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13393,11 +14927,34 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13636,15 +15193,41 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -13882,15 +15465,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14128,15 +15737,41 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14374,15 +16009,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14620,15 +16281,41 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14866,15 +16553,41 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15112,15 +16825,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15358,15 +17097,41 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15604,15 +17369,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15850,15 +17641,41 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16096,15 +17913,41 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16342,15 +18185,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16588,15 +18457,41 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16834,15 +18729,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -17080,15 +19001,41 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -17355,14 +19302,41 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17632,14 +19606,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17909,14 +19910,41 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch
 ; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18186,14 +20214,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18463,14 +20518,41 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18740,14 +20822,41 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19017,14 +21126,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19294,14 +21430,41 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19571,14 +21734,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19848,14 +22038,41 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20125,14 +22342,41 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20402,14 +22646,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20679,14 +22950,41 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20956,14 +23254,41 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -21233,14 +23558,41 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
index a27b40a8209ec..f16bef5b69582 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
@@ -179,11 +179,35 @@ define amdgpu_kernel void @private_workgroup_unordered_load(
 ;
 ; GFX1250-LABEL: private_workgroup_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -358,11 +382,35 @@ define amdgpu_kernel void @private_workgroup_monotonic_load(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -537,11 +585,35 @@ define amdgpu_kernel void @private_workgroup_acquire_load(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -716,11 +788,36 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -875,11 +972,35 @@ define amdgpu_kernel void @private_workgroup_unordered_store(
 ;
 ; GFX1250-LABEL: private_workgroup_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1032,11 +1153,35 @@ define amdgpu_kernel void @private_workgroup_monotonic_store(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1189,11 +1334,36 @@ define amdgpu_kernel void @private_workgroup_release_store(
 ;
 ; GFX1250-LABEL: private_workgroup_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1346,11 +1516,36 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -1503,11 +1698,35 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1660,11 +1879,36 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1817,11 +2061,36 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -1974,11 +2243,37 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2131,11 +2426,37 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -2344,11 +2665,34 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2559,11 +2903,35 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -2774,11 +3142,35 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -3017,15 +3409,41 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3263,15 +3681,42 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3509,15 +3954,42 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -3755,15 +4227,43 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4001,15 +4501,43 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4247,15 +4775,42 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4493,15 +5048,42 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4739,15 +5321,43 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -4985,15 +5595,43 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5231,15 +5869,43 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5477,15 +6143,43 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5723,15 +6417,43 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -5969,15 +6691,43 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6215,15 +6965,43 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6461,15 +7239,43 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -6736,14 +7542,41 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7013,14 +7846,41 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7290,14 +8150,42 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7567,14 +8455,42 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7844,14 +8760,42 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8121,14 +9065,41 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8398,14 +9369,41 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8675,14 +9673,42 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -8952,14 +9978,42 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9229,14 +10283,42 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9506,14 +10588,42 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -9783,14 +10893,42 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10060,14 +11198,42 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10337,14 +11503,42 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10614,14 +11808,42 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -10798,11 +12020,35 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -10977,11 +12223,35 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11156,11 +12426,35 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11335,11 +12629,35 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s1
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %in, ptr addrspace(5) %out) {
@@ -11494,11 +12812,35 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11651,11 +12993,35 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11808,11 +13174,35 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -11965,11 +13355,35 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_store_b32 v[0:1], v2 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     i32 %in, ptr addrspace(5) %out) {
 entry:
@@ -12122,11 +13536,35 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12279,11 +13717,35 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12436,11 +13898,35 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12593,11 +14079,35 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12750,11 +14260,35 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT:    s_mov_b32 s1, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_mov_b32_e32 v0, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s1, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s1, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; implicit-def: $sgpr1
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    flat_atomic_swap_b32 v[0:1], v2
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
 entry:
@@ -12963,11 +14497,34 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13178,11 +14735,34 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13393,11 +14973,34 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0
-; GFX1250-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 scope:SCOPE_SE
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s3, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in) {
@@ -13636,15 +15239,41 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -13882,15 +15511,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14128,15 +15783,41 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14374,15 +16055,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14620,15 +16327,41 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -14866,15 +16599,41 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15112,15 +16871,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15358,15 +17143,41 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15604,15 +17415,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -15850,15 +17687,41 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16096,15 +17959,41 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16342,15 +18231,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16588,15 +18503,41 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -16834,15 +18775,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -17080,15 +19047,41 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
-; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT:    s_mov_b32 s2, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT:    s_mov_b32 s2, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1250-NEXT:    s_mov_b32 s2, s5
+; GFX1250-NEXT:    s_mov_b32 s6, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s2, s4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; implicit-def: $sgpr2
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s0
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
 entry:
@@ -17355,14 +19348,41 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17632,14 +19652,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17909,14 +19956,41 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch
 ; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18186,14 +20260,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18463,14 +20564,41 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18740,14 +20868,41 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19017,14 +21172,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19294,14 +21476,41 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19571,14 +21780,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19848,14 +22084,41 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20125,14 +22388,41 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20402,14 +22692,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20679,14 +22996,41 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -20956,14 +23300,41 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -21233,14 +23604,41 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT:    s_mov_b32 s3, 16
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    scratch_load_b32 v0, off, s0 offset:16
-; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    v_cmp_eq_u32_e64 s2, v0, s2
-; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v0, s1, s2
-; GFX1250-NEXT:    scratch_store_b32 off, v1, s0 offset:16 scope:SCOPE_SE
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 0
+; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT:    s_mov_b32 s3, 20
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT:    v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1250-NEXT:    s_mov_b64 s[6:7], 0
+; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s5, -1
+; GFX1250-NEXT:    s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    s_mov_b32 s3, s6
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; implicit-def: $sgpr3
+; GFX1250-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v4, s1
+; GFX1250-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT:    v_mov_b32_e32 v3, v4
+; GFX1250-NEXT:    flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
     ptr addrspace(5) %out, i32 %in, i32 %old) {
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll
new file mode 100644
index 0000000000000..7c9d6ab332657
--- /dev/null
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-private-gas.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=GFX1200 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes=atomic-expand %s | FileCheck -check-prefixes=GFX1250 %s
+
+define void @system_atomic_store_unordered_float(ptr addrspace(5) %addr, float %val) {
+; GFX1200-LABEL: define void @system_atomic_store_unordered_float(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX1200-NEXT:    store float [[VAL]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret void
+;
+; GFX1250-LABEL: define void @system_atomic_store_unordered_float(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    store atomic float [[VAL]], ptr [[SCRATCH_ASCAST]] unordered, align 4
+; GFX1250-NEXT:    ret void
+;
+  store atomic float %val, ptr addrspace(5) %addr unordered, align 4
+  ret void
+}
+
+define void @system_atomic_store_unordered_i32(ptr addrspace(5) %addr, i32 %val) {
+; GFX1200-LABEL: define void @system_atomic_store_unordered_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    store i32 [[VAL]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret void
+;
+; GFX1250-LABEL: define void @system_atomic_store_unordered_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    store atomic i32 [[VAL]], ptr [[SCRATCH_ASCAST]] unordered, align 4
+; GFX1250-NEXT:    ret void
+;
+  store atomic i32 %val, ptr addrspace(5) %addr unordered, align 4
+  ret void
+}
+
+define void @system_atomic_store_release_i32(ptr addrspace(5) %addr, i32 %val) {
+; GFX1200-LABEL: define void @system_atomic_store_release_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    store i32 [[VAL]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret void
+;
+; GFX1250-LABEL: define void @system_atomic_store_release_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    store atomic i32 [[VAL]], ptr [[SCRATCH_ASCAST]] release, align 4
+; GFX1250-NEXT:    ret void
+;
+  store atomic i32 %val, ptr addrspace(5) %addr release, align 4
+  ret void
+}
+
+define void @workgroup_atomic_store_release_i32(ptr addrspace(5) %addr, i32 %val) {
+; GFX1200-LABEL: define void @workgroup_atomic_store_release_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    store i32 [[VAL]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret void
+;
+; GFX1250-LABEL: define void @workgroup_atomic_store_release_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    store atomic i32 [[VAL]], ptr [[SCRATCH_ASCAST]] syncscope("workgroup") release, align 4
+; GFX1250-NEXT:    ret void
+;
+  store atomic i32 %val, ptr addrspace(5) %addr syncscope("workgroup") release, align 4
+  ret void
+}
+
+define float @system_atomic_load_unordered_float(ptr addrspace(5) %addr) {
+; GFX1200-LABEL: define float @system_atomic_load_unordered_float(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    [[VAL:%.*]] = load float, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret float [[VAL]]
+;
+; GFX1250-LABEL: define float @system_atomic_load_unordered_float(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    [[VAL:%.*]] = load atomic float, ptr [[SCRATCH_ASCAST]] unordered, align 4
+; GFX1250-NEXT:    ret float [[VAL]]
+;
+  %val = load atomic float, ptr addrspace(5) %addr unordered, align 4
+  ret float %val
+}
+
+define i32 @system_atomic_load_unordered_i32(ptr addrspace(5) %addr) {
+; GFX1200-LABEL: define i32 @system_atomic_load_unordered_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret i32 [[VAL]]
+;
+; GFX1250-LABEL: define i32 @system_atomic_load_unordered_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    [[VAL:%.*]] = load atomic i32, ptr [[SCRATCH_ASCAST]] unordered, align 4
+; GFX1250-NEXT:    ret i32 [[VAL]]
+;
+  %val = load atomic i32, ptr addrspace(5) %addr unordered, align 4
+  ret i32 %val
+}
+
+define i32 @system_atomic_load_acquire_i32(ptr addrspace(5) %addr) {
+; GFX1200-LABEL: define i32 @system_atomic_load_acquire_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret i32 [[VAL]]
+;
+; GFX1250-LABEL: define i32 @system_atomic_load_acquire_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    [[VAL:%.*]] = load atomic i32, ptr [[SCRATCH_ASCAST]] acquire, align 4
+; GFX1250-NEXT:    ret i32 [[VAL]]
+;
+  %val = load atomic i32, ptr addrspace(5) %addr acquire, align 4
+  ret i32 %val
+}
+
+define i32 @workgroup_atomic_load_acquire_i32(ptr addrspace(5) %addr) {
+; GFX1200-LABEL: define i32 @workgroup_atomic_load_acquire_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret i32 [[VAL]]
+;
+; GFX1250-LABEL: define i32 @workgroup_atomic_load_acquire_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    [[VAL:%.*]] = load atomic i32, ptr [[SCRATCH_ASCAST]] syncscope("workgroup") acquire, align 4
+; GFX1250-NEXT:    ret i32 [[VAL]]
+;
+  %val = load atomic i32, ptr addrspace(5) %addr syncscope("workgroup") acquire, align 4
+  ret i32 %val
+}
+
+define i32 @system_atomic_cmpxchg_acq_rel_acquire_i32(ptr addrspace(5) %addr, i32 %old, i32 %in) {
+; GFX1200-LABEL: define i32 @system_atomic_cmpxchg_acq_rel_acquire_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[OLD:%.*]], i32 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[OLD]]
+; GFX1200-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[IN]], i32 [[TMP1]]
+; GFX1200-NEXT:    store i32 [[TMP3]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    [[TMP4:%.*]] = insertvalue { i32, i1 } poison, i32 [[TMP1]], 0
+; GFX1200-NEXT:    [[TMP5:%.*]] = insertvalue { i32, i1 } [[TMP4]], i1 [[TMP2]], 1
+; GFX1200-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX1200-NEXT:    ret i32 [[RES]]
+;
+; GFX1250-LABEL: define i32 @system_atomic_cmpxchg_acq_rel_acquire_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[OLD:%.*]], i32 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    [[VAL:%.*]] = cmpxchg volatile ptr [[SCRATCH_ASCAST]], i32 [[OLD]], i32 [[IN]] acq_rel acquire, align 4
+; GFX1250-NEXT:    [[RES:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
+; GFX1250-NEXT:    ret i32 [[RES]]
+;
+  %val = cmpxchg volatile ptr addrspace(5) %addr, i32 %old, i32 %in acq_rel acquire
+  %res = extractvalue { i32, i1 } %val, 0
+  ret i32 %res
+}
+
+define i32 @system_atomicrmw_add_acq_rel_i32(ptr addrspace(5) %addr, i32 %in) {
+; GFX1200-LABEL: define i32 @system_atomicrmw_add_acq_rel_i32(
+; GFX1200-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1200-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    store i32 [[IN]], ptr addrspace(5) [[ADDR]], align 4
+; GFX1200-NEXT:    ret i32 [[TMP1]]
+;
+; GFX1250-LABEL: define i32 @system_atomicrmw_add_acq_rel_i32(
+; GFX1250-SAME: ptr addrspace(5) [[ADDR:%.*]], i32 [[IN:%.*]]) #[[ATTR0]] {
+; GFX1250-NEXT:    [[SCRATCH_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ADDR]] to ptr
+; GFX1250-NEXT:    [[VAL:%.*]] = atomicrmw volatile xchg ptr [[SCRATCH_ASCAST]], i32 [[IN]] acq_rel, align 4
+; GFX1250-NEXT:    ret i32 [[VAL]]
+;
+  %val = atomicrmw volatile xchg ptr addrspace(5) %addr, i32 %in acq_rel
+  ret i32 %val
+}



More information about the llvm-branch-commits mailing list